In [2]:
from datasets import load_dataset
import json
import yaml

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os, getpass

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")


In [4]:
with open('README.md', 'r') as readme:
    lines = readme.readlines()
    
    # Find start and end of the language array in YAML format
    start_index = None
    for i, line in enumerate(lines):
        if line.strip() == "language:":
            start_index = i
            break
    
    end_index = len(lines)
    for j, line in enumerate(lines[start_index+1:], start=start_index+1):
        if not line.startswith('- '):
            end_index = j
            break

    language_section = ''.join(lines[start_index:end_index])
    
    # Load it with PyYAML
    readme_yaml = yaml.safe_load(language_section)
    langs = readme_yaml['language']

print(langs)

['bg', 'ca', 'code', 'cs', 'cy', 'da', 'de', 'el', 'en', 'es', 'et', 'eu', 'fi', 'fr', 'ga', 'gl', 'hr', 'hu', 'it', 'lt', 'lv', 'mt', 'nl', 'nn', '\\no', 'oc', 'pl', 'pt', 'ro', 'ru', 'sh', 'sk', 'sl', 'sr', 'sv', 'uk']


In [8]:
NUM_SAMPLES = 1000
#DATASET_NAME = "oscar-corpus/colossal-oscar-1.0"
DATASET_NAME = "oscar-corpus/oscar"
_set_env("HF_TOKEN")

In [9]:
if "colossal" in DATASET_NAME:
    ds = load_dataset(DATASET_NAME, "default", split="train", streaming=True)
    ds = ds.take(NUM_SAMPLES)

    for ex in ds:
        print(ex["content"])

In [10]:
samples = dict()

for lang in langs:
    if 2 <= len(lang) <= 3:
        try:
            lang = lang.replace("\\", "")
            name = "unshuffled_deduplicated_" + lang

            ds = load_dataset(DATASET_NAME, name, split="train", streaming=True)
            ds = ds.take(NUM_SAMPLES)
            samples[lang] = list(ds)
        except ValueError as e:
            print(e)
    else:
        print(f"lang {lang} not in corpus") # oscar not colossus
print(samples["es"])

lang code not in corpus
[{'id': 0, 'text': 'Como se librará de la celulitis en el gimnasio La piel superflua en las manos después del adelgazamiento, Los bailes fáciles para el adelgazamiento del vídeo La porción de comida al adelgazamiento. El sitio las revocaciones del adelgazamiento rf Si no hay pan blanco se puede adelgazar, Las grasas y los hidratos de carbono en los productos de la alimentación la tabla Adelgazar por medio del cacao.\nQue respetar que adelgaza Adelgazar para el querido, El régimen al dolor en el vientre al niño Los puntos del masaje para el adelgazamiento del vídeo. Adelgazar por los medios públicos por las hierbas Por qué adelgazas durante el sueño, Quien adelgazaba sobre los preparados Quiero la pastilla al adelgazamiento.\nIntercepta marina mexicana cargamento de droga flotando en alta mar Los ejercicios para el adelgazamiento de la persona en poco tiempo el vídeo Arrojar el peso en el ejército\nCon el jengibre y el limón para el adelgazamiento las revocacione

In [12]:
os.makedirs("imatrix/oscar/langs", exist_ok=True)

# Write all data to a single file "imatrix-dataset.txt" (flattened format with just text and language labels)
with open("imatrix/oscar/imatrix-dataset.txt", "w", encoding="utf-8") as imatrix_file:
    for lang, data in samples.items():
        for item in data:
            # Write the language and text content to the single file
            imatrix_file.write(f"{lang}: {item['text']}\n")

# Write each language's data to "imatrix/lang.txt" in JSON format
for lang, data in samples.items():
    with open(f"imatrix/oscar/langs/{lang}.txt", "w", encoding="utf-8") as lang_file:
        # Write each entry as a JSON object in a separate line for easier parsing
        for item in data:
            json.dump(item, lang_file)
            lang_file.write("\n")  # Add a newline for each JSON entry
    

In [13]:
import os
from collections import defaultdict

def count_results_by_language(dataset_dir):
    # Adjust this path to point to your language files directory
    lang_counts = defaultdict(int)
    
    for filename in os.listdir(dataset_dir):
        if filename.endswith(".txt"):  # Assuming language files are in .txt format
            lang_code = filename.split("-")[0]  # Assumes filename starts with lang code
            with open(os.path.join(dataset_dir, filename), "r", encoding="utf-8") as f:
                for line in f:
                    if line.strip():  # Only count non-empty lines
                        lang_counts[lang_code] += 1
    
    total_results = sum(lang_counts.values())
    
    # Print results summary
    print(f"Total Results: {total_results}")
    for lang, count in lang_counts.items():
        print(f"Language {lang}: {count} results")
    
    return lang_counts, total_results

# Example usage
dataset_directory = "./imatrix/oscar/langs"  # Change to your dataset path
count_results_by_language(dataset_directory)


Total Results: 35000
Language el.txt: 1000 results
Language sv.txt: 1000 results
Language pl.txt: 1000 results
Language en.txt: 1000 results
Language ru.txt: 1000 results
Language ga.txt: 1000 results
Language fr.txt: 1000 results
Language sr.txt: 1000 results
Language lv.txt: 1000 results
Language cy.txt: 1000 results
Language lt.txt: 1000 results
Language nn.txt: 1000 results
Language mt.txt: 1000 results
Language no.txt: 1000 results
Language nl.txt: 1000 results
Language uk.txt: 1000 results
Language cs.txt: 1000 results
Language oc.txt: 1000 results
Language ca.txt: 1000 results
Language bg.txt: 1000 results
Language fi.txt: 1000 results
Language da.txt: 1000 results
Language hu.txt: 1000 results
Language sh.txt: 1000 results
Language gl.txt: 1000 results
Language es.txt: 1000 results
Language sk.txt: 1000 results
Language ro.txt: 1000 results
Language et.txt: 1000 results
Language pt.txt: 1000 results
Language hr.txt: 1000 results
Language eu.txt: 1000 results
Language it.txt: 10

(defaultdict(int,
             {'el.txt': 1000,
              'sv.txt': 1000,
              'pl.txt': 1000,
              'en.txt': 1000,
              'ru.txt': 1000,
              'ga.txt': 1000,
              'fr.txt': 1000,
              'sr.txt': 1000,
              'lv.txt': 1000,
              'cy.txt': 1000,
              'lt.txt': 1000,
              'nn.txt': 1000,
              'mt.txt': 1000,
              'no.txt': 1000,
              'nl.txt': 1000,
              'uk.txt': 1000,
              'cs.txt': 1000,
              'oc.txt': 1000,
              'ca.txt': 1000,
              'bg.txt': 1000,
              'fi.txt': 1000,
              'da.txt': 1000,
              'hu.txt': 1000,
              'sh.txt': 1000,
              'gl.txt': 1000,
              'es.txt': 1000,
              'sk.txt': 1000,
              'ro.txt': 1000,
              'et.txt': 1000,
              'pt.txt': 1000,
              'hr.txt': 1000,
              'eu.txt': 1000,
              'it.txt'

In [22]:
import random

PPL_SAMPLE_SIZE_PER_LANG=50

# Assuming 'samples' is your dictionary with language keys and lists of text as values
subsampled_data = []

for lang, texts in samples.items():
    subsampled_data.extend(random.sample(texts, PPL_SAMPLE_SIZE_PER_LANG))  


In [24]:
# Save to a raw .txt file for PPL testing
output_file = "ppl_test_data.txt"

with open(output_file, 'w', encoding='utf-8') as f:
    for entry in subsampled_data:
        f.write(entry['text'] + "\n")

print(f"PPL test data saved to {output_file}")


PPL test data saved to ppl_test_data.txt
