added download function and edited INFO

- .DS_Store +0 -0
- data/.DS_Store +0 -0
- data/INFO +1 -1
- evals/datasets_/flores.py +0 -2
- evals/download_data.py +299 -0
- evals/languages.py +3 -3

.DS_Store
ADDED

Binary file (6.15 kB).

data/.DS_Store
ADDED

Binary file (6.15 kB).
    	
data/INFO
CHANGED

@@ -1,5 +1,5 @@
 fleurs: https://huggingface.co/datasets/google/fleurs via eval.py
-floresp-v2.0-rc.3: https://
+floresp-v2.0-rc.3: https://huggingface.co/datasets/openlanguagedata/flores_plus
 glottolog_languoid.csv: https://glottolog.org/meta/downloads
 ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
 spbleu: https://github.com/facebookresearch/flores/tree/main/flores200#spm-and-dictionary
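
For orientation, these INFO entries correspond to the following data/ layout once evals/download_data.py (added below) has run; the paths are taken from that script's configuration, and the floresp-v2.0-rc.3/dev text files are produced by its parquet-to-text conversion step:

    data/
        fleurs/<fleurs_tag>/dev.tsv, audio/dev/...           (fleurs)
        floresp-v2.0-rc.3/dev_parquet/dev_split.parquet      (Flores+ raw download)
        floresp-v2.0-rc.3/dev/dev.<iso_639_3>_<iso_15924>    (converted text files)
        glottolog_languoid.csv/languoid.csv                  (Glottolog)
        ScriptCodes.csv                                      (manual download)
        spbleu/flores200_sacrebleu_tokenizer_spm.model       (spbleu)
        spbleu/dictionary.txt                                (spbleu)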
    	
evals/datasets_/flores.py
CHANGED

@@ -22,8 +22,6 @@ def aggregate_flores_paths(flores_paths):
     ]
     return flores_paths.values[populations.index(max(populations))]
 
-
-
 flores = pd.DataFrame(
     [f.split(".")[1] for f in os.listdir(flores_dir)],
     columns=["flores_path"],
    	
evals/download_data.py
ADDED

@@ -0,0 +1,299 @@
+# download_data.py
+import requests
+import tarfile
+import zipfile
+import io
+import pandas as pd
+from pathlib import Path
+import sys
+import huggingface_hub
+from datasets import load_dataset, DatasetDict
+# Import fleurs DataFrame directly from its source module
+from datasets_.fleurs import fleurs
+
+# --- Configuration ---
+
+
+# Add project root to sys.path (still useful for potential future imports if needed)
+project_root = Path(__file__).resolve().parent
+if str(project_root) not in sys.path:
+    sys.path.append(str(project_root))
+
+DATA_DIR = project_root / "data"
+
+FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
+FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
+
+FLORES_PLUS_HF_ID = "openlanguagedata/flores_plus"
+FLORES_TARGET_DIR = DATA_DIR / "floresp-v2.0-rc.3" / "dev_parquet"  # Note: Saving as parquet
+
+GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip"  # Assumed direct link from https://glottolog.org/meta/downloads
+GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
+GLOTTOLOG_CSV_NAME = "languoid.csv"
+
+SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html"  # This is HTML, need manual download or parsing
+SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"
+
+SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm"  # Assumed direct link
+SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
+SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
+SPBLEU_DICT_URL = "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
+SPBLEU_DICT_NAME = "dictionary.txt"
+
+
+# --- Helper Functions ---
+
+def download_file(url, path: Path):
+    """Downloads a file from a URL to a local path."""
+    print(f"Downloading {url} to {path}...")
+    try:
+        response = requests.get(url, stream=True, timeout=60)
+        response.raise_for_status()  # Raise an exception for bad status codes
+        path.parent.mkdir(parents=True, exist_ok=True)
+        with open(path, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+        print(f"Successfully downloaded {path.name}.")
+    except requests.exceptions.RequestException as e:
+        print(f"Error downloading {url}: {e}")
+    except Exception as e:
+        print(f"An error occurred while saving {path}: {e}")
+
+
+def extract_tar_gz(tar_path: Path, extract_path: Path):
+    """Extracts a .tar.gz file."""
+    print(f"Extracting {tar_path} to {extract_path}...")
+    try:
+        with tarfile.open(tar_path, "r:gz") as tar:
+            tar.extractall(path=extract_path)
+        print(f"Successfully extracted {tar_path.name}.")
+        # tar_path.unlink()  # Optionally remove the archive after extraction
+    except tarfile.TarError as e:
+        print(f"Error extracting {tar_path}: {e}")
+    except Exception as e:
+        print(f"An unexpected error occurred during extraction: {e}")
+
+
+def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
+    """Extracts a specific file from zip content in memory."""
+    print(f"Extracting {target_filename} from zip data to {extract_path}...")
+    try:
+        with zipfile.ZipFile(io.BytesIO(zip_content)) as z:
+            # Find the correct file within the zip structure
+            target_zip_path = None
+            for member in z.namelist():
+                if member.endswith(target_filename):
+                    target_zip_path = member
+                    break
+
+            if target_zip_path:
+                with z.open(target_zip_path) as source, open(extract_path / target_filename, "wb") as target:
+                    target.write(source.read())
+                print(f"Successfully extracted {target_filename}.")
+            else:
+                print(f"Error: Could not find {target_filename} within the zip archive.")
+
+    except zipfile.BadZipFile:
+        print("Error: Downloaded file is not a valid zip archive.")
+    except Exception as e:
+        print(f"An error occurred during zip extraction: {e}")
+
+
+# --- Download Functions ---
+
+def download_fleurs_data():
+    """Downloads Fleurs audio and text data."""
+    print("\n--- Downloading Fleurs Data ---")
+    FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Use the fleurs_tag column from the imported DataFrame
+    fleurs_tags_list = fleurs['fleurs_tag'].tolist()
+
+    if not fleurs_tags_list:
+        print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")
+        return
+
+    print(f"Checking/Downloading Fleurs for {len(fleurs_tags_list)} languages...")
+
+    for lang_tag in fleurs_tags_list:
+        lang_dir = FLEURS_TARGET_DIR / lang_tag
+        audio_dir = lang_dir / "audio"
+        dev_tsv_path = lang_dir / "dev.tsv"
+        dev_audio_archive_path = audio_dir / "dev.tar.gz"
+        audio_extracted_marker = audio_dir / "dev"  # Check if extraction likely happened
+
+        # Download TSV
+        if not dev_tsv_path.exists():
+            tsv_url = f"{FLEURS_BASE_URL}/{lang_tag}/dev.tsv"
+            download_file(tsv_url, dev_tsv_path)
+        else:
+            print(f"Found: {dev_tsv_path}")
+
+        # Download and Extract Audio
+        if not audio_extracted_marker.exists():
+            if not dev_audio_archive_path.exists():
+                tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
+                download_file(tar_url, dev_audio_archive_path)
+
+            if dev_audio_archive_path.exists():
+                extract_tar_gz(dev_audio_archive_path, audio_dir)
+            else:
+                print(f"Audio archive missing, cannot extract for {lang_tag}")
+        else:
+            print(f"Found extracted audio: {audio_extracted_marker}")
+
+def download_flores_plus_data():
+    """Downloads Flores+ data using Hugging Face datasets library."""
+    print("\n--- Downloading Flores+ Data (requires HF login & accepted terms) ---")
+    FLORES_TARGET_DIR.mkdir(parents=True, exist_ok=True)
+
+    try:
+        # Check login status first
+        token = huggingface_hub.HfFolder.get_token()
+        if not token:
+            print("Hugging Face token not found. Please log in using `huggingface-cli login`.")
+            print("You also need to accept the terms for 'openlanguagedata/flores_plus' on the HF website.")
+            return
+
+        print(f"Attempting to download '{FLORES_PLUS_HF_ID}' (dev split)...")
+        # Load only the 'dev' split
+        ds = load_dataset(FLORES_PLUS_HF_ID, split='dev', verification_mode='no_checks')
+
+        # Save as parquet files, potentially one per language if needed later
+        # For simplicity now, save the whole dev split as one parquet file
+        target_file = FLORES_TARGET_DIR / "dev_split.parquet"
+        print(f"Saving dev split to {target_file}...")
+        ds.to_parquet(target_file)
+        print("Flores+ dev split downloaded and saved as parquet.")
+
+    except huggingface_hub.utils.GatedRepoError:
+        print(f"Error: Access to '{FLORES_PLUS_HF_ID}' is gated.")
+        print("Please ensure you are logged in (`huggingface-cli login`) and have accepted the terms")
+        print(f"on the dataset page: https://huggingface.co/datasets/{FLORES_PLUS_HF_ID}")
+    except Exception as e:
+        print(f"An error occurred downloading or saving Flores+: {e}")
+
+
+def download_glottolog_data():
+    """Downloads and extracts Glottolog languoid CSV."""
+    print("\n--- Downloading Glottolog Data ---")
+    target_csv = GLOTTOLOG_TARGET_DIR / GLOTTOLOG_CSV_NAME
+    if not target_csv.exists():
+        print(f"Downloading Glottolog zip from {GLOTTOLOG_URL}...")
+        try:
+            response = requests.get(GLOTTOLOG_URL, timeout=60)
+            response.raise_for_status()
+            GLOTTOLOG_TARGET_DIR.mkdir(parents=True, exist_ok=True)
+            extract_zip(response.content, GLOTTOLOG_TARGET_DIR, GLOTTOLOG_CSV_NAME)
+        except requests.exceptions.RequestException as e:
+            print(f"Error downloading Glottolog zip: {e}")
+        except Exception as e:
+            print(f"An error occurred processing Glottolog: {e}")
+    else:
+        print(f"Found: {target_csv}")
+
+
+def download_scriptcodes_data():
+    """Prints instructions for manually downloading the ScriptCodes CSV."""
+    print("\n--- Downloading ScriptCodes Data ---")
+    # The URL points to an HTML page, not a direct CSV link.
+    # Manual download is likely required for ScriptCodes.csv.
+    print(f"Cannot automatically download from {SCRIPTCODES_URL}")
+    print("Please manually download the ISO 15924 codes list (often available as a .txt file)")
+    print("from the Unicode website or related sources and save it as:")
+    print(f"{SCRIPTCODES_TARGET_FILE}")
+    if SCRIPTCODES_TARGET_FILE.exists():
+        print(f"Note: File already exists at {SCRIPTCODES_TARGET_FILE}")
+
+
+def download_spbleu_data():
+    """Downloads the SPM model and dictionary for spbleu."""
+    print("\n--- Downloading spbleu SPM Model and Dictionary ---")
+    SPBLEU_TARGET_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Download SPM Model
+    target_model_file = SPBLEU_TARGET_DIR / SPBLEU_SPM_NAME
+    if not target_model_file.exists():
+        print("Downloading SPM Model...")
+        download_file(SPBLEU_SPM_URL, target_model_file)
+    else:
+        print(f"Found: {target_model_file}")
+
+    # Download Dictionary
+    target_dict_file = SPBLEU_TARGET_DIR / SPBLEU_DICT_NAME
+    if not target_dict_file.exists():
+        print("Downloading Dictionary...")
+        download_file(SPBLEU_DICT_URL, target_dict_file)
+    else:
+        print(f"Found: {target_dict_file}")
+
+# --- Conversion Function ---
+
+def convert_flores_parquet_to_text():
+    """Converts the downloaded Flores+ parquet dev split to text files."""
+    print("\n--- Converting Flores+ Parquet to Text Files ---")
+    parquet_file = FLORES_TARGET_DIR / "dev_split.parquet"
+    text_dir = project_root / "data" / "floresp-v2.0-rc.3" / "dev"  # Original expected dir
+
+    if not parquet_file.exists():
+        print(f"Parquet file not found: {parquet_file}. Skipping conversion.")
+        return
+
+    try:
+        print(f"Reading parquet file: {parquet_file}")
+        df = pd.read_parquet(parquet_file)
+        print(f"Read {len(df)} rows from parquet.")
+
+        if not all(col in df.columns for col in ['iso_639_3', 'iso_15924', 'text']):
+            print("Error: Parquet file missing required columns (iso_639_3, iso_15924, text).")
+            return
+
+        text_dir.mkdir(parents=True, exist_ok=True)
+        print(f"Target directory for text files: {text_dir}")
+
+        # Group by language and script to create individual files
+        grouped = df.groupby(['iso_639_3', 'iso_15924'])
+        count = 0
+        for (lang, script), group in grouped:
+            target_filename = f"dev.{lang}_{script}"
+            target_path = text_dir / target_filename
+            print(f"Writing {len(group)} sentences to {target_path}...")
+            try:
+                with open(target_path, 'w', encoding='utf-8') as f:
+                    for sentence in group['text']:
+                        f.write(sentence + '\n')
+                count += 1
+            except Exception as e:
+                print(f"Error writing file {target_path}: {e}")
+
+        print(f"Successfully wrote {count} language/script files to {text_dir}.")
+
+    except ImportError:
+        print("Error: pandas or pyarrow might be missing. Cannot read parquet.")
+        print("Please install them: pip install pandas pyarrow")
+    except Exception as e:
+        print(f"An error occurred during parquet conversion: {e}")
+
+
+# --- Main Execution ---
+
+def main():
+    """Runs all download functions and the conversion step."""
+    print("Starting data download process...")
+    DATA_DIR.mkdir(exist_ok=True)
+
+    download_flores_plus_data()
+    convert_flores_parquet_to_text()
+    # download_fleurs_data()
+    download_glottolog_data()
+    download_scriptcodes_data()
+    download_spbleu_data()
+
+    print("\nData download process finished.")
+    print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
+    print("Note: Flores+ was downloaded as parquet and converted to the text layout")
+    print("that 'evals/datasets_/flores.py' expects to read.")
+
+
+if __name__ == "__main__":
+    main()
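
A minimal usage sketch for the new script, assuming it is run from the evals/ directory (so that `from datasets_.fleurs import fleurs` resolves) and that `huggingface-cli login` has been completed with the Flores+ terms accepted; the per-step invocation is illustrative, not part of the script:

    # Run the full pipeline (equivalent to `python download_data.py` from evals/):
    from download_data import main
    main()

    # Or run individual steps, e.g. only the Flores+ download and its text conversion:
    from download_data import download_flores_plus_data, convert_flores_parquet_to_text
    download_flores_plus_data()       # gated dataset: needs HF login + accepted terms
    convert_flores_parquet_to_text()  # writes data/floresp-v2.0-rc.3/dev/dev.<lang>_<script>

Note that download_fleurs_data() is commented out in main(), so Fleurs data is only fetched when that step is invoked explicitly.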
    	
evals/languages.py
CHANGED

@@ -1,9 +1,9 @@
 import re
 
 import pandas as pd
-from datasets_.commonvoice import commonvoice
-from datasets_.fleurs import fleurs
-from datasets_.flores import flores
+from .datasets_.commonvoice import commonvoice
+from .datasets_.fleurs import fleurs
+from .datasets_.flores import flores
 from joblib.memory import Memory
 from langcodes import Language, standardize_tag
 from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
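
A side effect of the switch to package-relative imports, sketched below: languages.py now imports only as part of the evals package (the package name is assumed from the repo layout), not as a top-level module.

    # Works when evals/ is importable as a package from the repo root:
    #   python -c "from evals import languages"
    # Fails with "attempted relative import with no known parent package":
    #   python evals/languages.py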

