# Space: openpecha/translation_term_analyser (status: Sleeping)
# File size: 11,396 bytes
# Ref: 20dc456

import json
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Dict, List, Optional

from claudette import Chat, models


@dataclass
class Context:
    """Record describing one usage context of a term.

    NOTE(review): nothing in the visible part of this file instantiates this
    class; ``analyze_term`` receives its contexts as plain dicts. The field
    meanings below are inferred from the field names - confirm against the
    data files.
    """

    tibetan: str  # presumably the Tibetan passage containing the term
    english: str  # presumably an English translation of that passage
    commentaries: List[str]  # presumably commentary excerpts on the passage
    sanskrit: Optional[str] = None  # optional; presumably the Sanskrit parallel


class AnalysisType(Enum):
    """Stages of the term-analysis pipeline.

    The members are used as dictionary keys for the per-stage system prompts
    and chat sessions held by ``BuddhistTermAnalyzer``.
    """

    SEMANTIC = "semantic"  # philological analysis of the term
    TERM_GENERATION = "term_generation"  # proposing translation candidates
    EVALUATION = "evaluation"  # scoring the proposed candidates


class BuddhistTermAnalyzer:
    """Three-stage LLM pipeline for analysing and translating a Tibetan term.

    The stages run in order, each in its own chat session with its own system
    prompt:

    1. semantic analysis of the term from the supplied contexts,
    2. generation of English translation candidates per audience,
    3. evaluation of those candidates.

    Attributes:
        model: Model identifier handed to every ``Chat``.
        total_api_calls_cost: Running sum of the cost of all calls made
            through this analyzer.
        token_usage: Usage and cost of the most recent call of each stage,
            keyed by ``str(AnalysisType)``.
        system_prompts: System prompt per ``AnalysisType``.
        chats: ``Chat`` session per ``AnalysisType``.

    NOTE(review): the ``Chat`` objects are created once and reused, so
    presumably conversation history carries over between ``analyze_term``
    calls on the same analyzer - confirm that this is intended.
    """

    # (key in the term-generation JSON, label used in the final report),
    # in the order the recommendations are reported.
    _AUDIENCE_LABELS = (
        ("academic", "Academic"),
        ("practitioner", "Practitioner"),
        ("general", "General"),
    )

    def __init__(self, model: Optional[str] = None):
        """Create the per-stage chat sessions.

        Args:
            model: Model identifier to use. When omitted, ``models[1]`` from
                claudette is used, exactly as before.
        """
        # NOTE(review): models[1] was expected to be claude-3-5-sonnet when
        # this was written; the order of claudette's ``models`` list depends
        # on the installed version, so pass ``model`` explicitly to pin it.
        self.model = models[1] if model is None else model
        self.total_api_calls_cost = 0
        self.token_usage = {}
        # Last cumulative cost seen per chat, so only the increment is billed.
        self._billed_cost = {}

        # Initialize different chats for different analysis types
        self.system_prompts = {
            AnalysisType.SEMANTIC: """You are an expert in Buddhist terminology analysis with deep knowledge of Sanskrit and Tibetan.
            Analyze the given term through a systematic philological approach.
            You must ONLY respond with a valid JSON object, no other text.
            Never include any explanatory text before or after the JSON.

            Required JSON structure:
            {
                "sanskrit_analysis": {
                    "term": "string",  # Sanskrit equivalent
                    "morphology": "string",  # Morphological breakdown
                    "literal_meaning": "string",  # Literal meaning in Sanskrit
                    "technical_usage": "string"  # Technical usage in Sanskrit Buddhist literature
                },
                "tibetan_mapping": {
                    "term": "string",  # Tibetan term
                    "morphology": "string",  # Morphological breakdown of Tibetan
                    "translation_strategy": "string",  # How Tibetan translates the Sanskrit
                    "semantic_extension": "string"  # Any semantic changes or extensions in Tibetan
                },
                "commentary_insights": [
                    {
                        "source": "string",  # Which commentary
                        "explanation": "string",  # Key explanation
                        "technical_points": ["string"]  # Technical clarifications
                    }
                ],
                "english_renderings": [
                    {
                        "translation": "string",
                        "accuracy_score": number,  # 1-10
                        "captures_sanskrit": boolean,
                        "captures_tibetan": boolean,
                        "notes": "string"
                    }
                ],
                "semantic_synthesis": {
                    "core_meaning": "string",  # Core meaning synthesized from all sources
                    "technical_usage": ["string"],  # List of technical usages found in context
                    "connotative_aspects": ["string"]  # Important connotations and implications
                },
                "usage_examples": [
                    {
                        "source_text": "string",  # Original context
                        "usage_type": "string",  # How term is used here
                        "commentary_explanation": "string"  # What commentary says about this usage
                    }
                ]
            }""",
            AnalysisType.TERM_GENERATION: """You are an expert Buddhist translator.
            You must ONLY respond with a valid JSON object, no other text.
            Never include any explanatory text before or after the JSON.

            Required JSON structure:
            {
                "academic": {
                    "terms": ["term1", "term2"],
                    "reasoning": "string"
                },
                "practitioner": {
                    "terms": ["term1", "term2"],
                    "reasoning": "string"
                },
                "general": {
                    "terms": ["term1", "term2"],
                    "reasoning": "string"
                }
            }""",
            AnalysisType.EVALUATION: """You are an expert evaluator of Buddhist translations.
            You must ONLY respond with a valid JSON object, no other text.
            Never include any explanatory text before or after the JSON.

            Required JSON structure:
            {
                "evaluations": {
                    "term": {
                        "technical_score": 0.0,
                        "cultural_score": 0.0,
                        "audience_score": 0.0,
                        "reasoning": "string"
                    }
                }
            }""",
        }

        # Initialize chats with respective system prompts
        self.chats = {
            analysis_type: Chat(self.model, sp=system_prompt)
            for analysis_type, system_prompt in self.system_prompts.items()
        }

    def create_semantic_prompt(self, tibetan_term: str, contexts: List[Dict]) -> str:
        """Build the user prompt for the semantic-analysis stage.

        Args:
            tibetan_term: The term to analyse.
            contexts: JSON-serialisable context records, embedded verbatim.

        Returns:
            The prompt text.
        """
        return f"""
        Analyze this Buddhist term following these steps:

        Target Term: {tibetan_term}

        Analysis Process:
        1. First analyze the Sanskrit source:
           - Identify the Sanskrit equivalent
           - Break down its morphology
           - Understand its literal and technical meanings

        2. Map to Tibetan:
           - Analyze how Tibetan translates the Sanskrit
           - Note any semantic extensions or modifications
           - Understand the translation strategy

        3. Study the commentaries:
           - Extract key explanations
           - Note technical clarifications
           - Identify special usages explained

        4. Evaluate English translations:
           - Compare against Sanskrit and Tibetan meanings
           - Assess accuracy and completeness
           - Note which aspects are captured/missed

        5. Synthesize understanding:
           - Combine insights from all sources
           - Document technical usage patterns
           - Note important connotations

        Contexts:
        {json.dumps(contexts, indent=2, ensure_ascii=False)}

        Important:
        - Base analysis strictly on provided contexts
        - Use commentaries to resolve ambiguities
        - Pay special attention to technical terms in commentaries
        - Note when English translations diverge from Sanskrit/Tibetan
        - Document specific usage examples from the context

        Remember: Return ONLY the JSON object with no other text."""

    def create_generation_prompt(
        self, tibetan_term: str, semantic_analysis: Dict
    ) -> str:
        """Build the user prompt for the term-generation stage.

        Args:
            tibetan_term: The term being translated.
            semantic_analysis: Parsed output of the semantic stage.

        Returns:
            The prompt text.
        """
        return f"""
        Respond ONLY with a JSON object containing translation candidates:

        Term: {tibetan_term}

        Semantic Analysis:
        {json.dumps(semantic_analysis, indent=2, ensure_ascii=False)}

        Remember: Return ONLY the JSON object with no other text."""

    def create_evaluation_prompt(
        self, tibetan_term: str, candidates: Dict, semantic_analysis: Dict
    ) -> str:
        """Build the user prompt for the evaluation stage.

        Args:
            tibetan_term: The term being translated.
            candidates: Parsed output of the term-generation stage.
            semantic_analysis: Parsed output of the semantic stage.

        Returns:
            The prompt text.
        """
        return f"""
        Respond ONLY with a JSON object evaluating these candidates:

        Term: {tibetan_term}

        Candidates:
        {json.dumps(candidates, indent=2, ensure_ascii=False)}

        Semantic Analysis:
        {json.dumps(semantic_analysis, indent=2, ensure_ascii=False)}

        Remember: Return ONLY the JSON object with no other text."""

    @staticmethod
    def _parse_json_response(text: str) -> Dict:
        """Parse a model reply as JSON, tolerating text around the object.

        The reply is parsed as-is first. If that fails, the span from the
        first ``{`` to the last ``}`` is parsed instead, which covers replies
        wrapped in a Markdown code fence or framed by a stray sentence.

        Raises:
            json.JSONDecodeError: If no parseable JSON object is found.
        """
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            return json.loads(text[start : end + 1])

    def _track_usage(self, analysis_type: "AnalysisType", response):
        """Record the cost and token usage of the call that just completed.

        ``Chat.cost`` is treated as a running total for the chat (claudette
        derives it from the usage accumulated by the chat's client), so only
        the increase since the previous call is added to the grand total.
        Adding the raw value every time would count earlier calls again
        whenever the analyzer is reused.
        """
        cumulative_cost = self.chats[analysis_type].cost
        call_cost = cumulative_cost - self._billed_cost.get(analysis_type, 0)
        self._billed_cost[analysis_type] = cumulative_cost
        self.total_api_calls_cost += call_cost
        self.token_usage[str(analysis_type)] = {
            "token_usage": repr(response.usage),
            "api_call_cost": call_cost,
        }

    def _run_stage(self, analysis_type: "AnalysisType", prompt: str) -> Dict:
        """Send ``prompt`` to the stage's chat, track usage, return parsed JSON."""
        response = self.chats[analysis_type](prompt)
        # Track before parsing so the call is billed even if parsing fails.
        self._track_usage(analysis_type, response)
        return self._parse_json_response(response.content[0].text)

    def analyze_term(self, tibetan_term: str, contexts: List[Dict]) -> Dict:
        """Run the full pipeline for one term.

        Args:
            tibetan_term: The term to analyse.
            contexts: JSON-serialisable context records for the term.

        Returns:
            The combined report produced by ``format_results``.

        Raises:
            json.JSONDecodeError: If a stage reply contains no parseable JSON.
        """
        # 1. Semantic analysis
        semantic_analysis = self._run_stage(
            AnalysisType.SEMANTIC,
            self.create_semantic_prompt(tibetan_term, contexts),
        )

        # 2. Term generation, informed by the semantic analysis
        candidates = self._run_stage(
            AnalysisType.TERM_GENERATION,
            self.create_generation_prompt(tibetan_term, semantic_analysis),
        )

        # 3. Evaluation of the generated candidates
        evaluations = self._run_stage(
            AnalysisType.EVALUATION,
            self.create_evaluation_prompt(
                tibetan_term, candidates, semantic_analysis
            ),
        )

        # Combine results
        return self.format_results(
            tibetan_term,
            semantic_analysis,
            candidates,
            evaluations,
        )

    def format_results(
        self,
        tibetan_term: str,
        semantic_analysis: Dict,
        candidates: Dict,
        evaluations: Dict,
    ) -> Dict:
        """Assemble the final report from the three stage outputs.

        For each audience the first candidate term is reported. If the model
        returned an empty ``terms`` list the term is ``None`` rather than an
        ``IndexError``, so the rest of the report survives.

        Args:
            tibetan_term: The analysed term.
            semantic_analysis: Parsed output of the semantic stage.
            candidates: Parsed output of the term-generation stage.
            evaluations: Parsed output of the evaluation stage.

        Returns:
            A dict with the term, per-audience recommendations, the semantic
            analysis, the evaluations, and the cost and usage bookkeeping.

        Raises:
            KeyError: If an expected key is missing from ``candidates`` or
                ``evaluations``.
        """
        recommendations = {}
        for key, label in self._AUDIENCE_LABELS:
            entry = candidates[key]
            terms = entry["terms"]
            recommendations[label] = {
                "term": terms[0] if terms else None,
                "reasoning": entry["reasoning"],
            }

        return {
            "tibetan_term": tibetan_term,
            "recommendations": recommendations,
            "analysis": semantic_analysis,
            "evaluations": evaluations["evaluations"],
            "total_api_calls_cost": self.total_api_calls_cost,
            # Snapshot: later calls must not rewrite a report already returned.
            "token_usage": dict(self.token_usage),
        }


class TermStandardizationAgent:
    """Facade that owns a ``BuddhistTermAnalyzer`` and exposes one entry point."""

    def __init__(self):
        """Create the analyzer that does the actual work."""
        self.analyzer = BuddhistTermAnalyzer()

    def select_best_terms(self, tibetan_term: str, contexts: List[Dict]) -> Dict:
        """Analyse ``tibetan_term`` in the given contexts.

        Delegates to the analyzer's ``analyze_term`` and returns its result
        unchanged.
        """
        return self.analyzer.analyze_term(tibetan_term, contexts)


# Example usage
def main():
    """Run the pipeline for one sample term and save the report as JSON.

    Reads the contexts from ``data/<term>.json`` next to this file, runs the
    standardization agent, and writes the report to
    ``results/<term>_<timestamp>.json`` next to this file.
    """
    from pathlib import Path

    base_dir = Path(__file__).parent

    # Initialize agent
    agent = TermStandardizationAgent()

    # Test input
    tibetan_term = "བྱང་ཆུབ་སེམས་"
    contexts_fn = base_dir / "data" / f"{tibetan_term}.json"
    # Explicit UTF-8 and a context manager: the data is Tibetan text, the
    # platform default encoding may differ, and the handle must be closed.
    with contexts_fn.open(encoding="utf-8") as contexts_file:
        contexts = json.load(contexts_file)

    # Process term
    results = agent.select_best_terms(tibetan_term, contexts)
    date_time = datetime.now().strftime("%Y%m%d%H%M%S")
    results_path = base_dir / "results"
    results_path.mkdir(exist_ok=True, parents=True)
    result_fn = results_path / f"{tibetan_term}_{date_time}.json"
    # ensure_ascii=False emits raw Tibetan characters, so the file must be
    # written as UTF-8; closing the handle guarantees the data is flushed.
    with result_fn.open("w", encoding="utf-8") as result_file:
        json.dump(results, result_file, indent=2, ensure_ascii=False)
    print(f"Results saved to: {result_fn}")


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()