import json

import evaluate
from tqdm import tqdm
from transformers import pipeline

# Load the model pipeline
pipe = pipeline(
    "text2text-generation",
    model="crossroderick/dalat5",
    tokenizer="crossroderick/dalat5",
)

# Load metrics
bleu = evaluate.load("bleu")
chrf = evaluate.load("chrf")

# Load the JSONL dataset
dataset_path = "src/data/clean_corpus.jsonl"
examples = []

with open(dataset_path, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)

        if "transliteration" in obj and "src" in obj["transliteration"] and "tgt" in obj["transliteration"]:
            examples.append((obj["transliteration"]["src"], obj["transliteration"]["tgt"]))

# Run predictions
predictions = []
references = []

print(f"Evaluating on {len(examples)} examples...\n")

for src, tgt in tqdm(examples):
    input_prompt = f"Cyrillic2Latin: {src}"
    output = pipe(input_prompt, max_length=128, do_sample=False)[0]["generated_text"]

    predictions.append(output.strip())
    references.append([tgt.strip()])  # wrap in a list for BLEU

# Evaluate
bleu_result = bleu.compute(predictions=predictions, references=references)
chrf_result = chrf.compute(predictions=predictions, references=references)

# Print results
print("\nEvaluation results:")
print(f"BLEU Score: {bleu_result['bleu']:.2f}")
print(f"chrF Score: {chrf_result['score']:.2f}")