|
import os |
|
import json |
|
import glob |
|
import argparse |
|
|
|
|
|
def extract_unique_contexts(input_directory, output_directory):
    """Write the unique ``context`` values of each JSONL file to a JSON file.

    Every ``*.jsonl`` file directly inside ``input_directory`` is read line by
    line. Each non-blank line is parsed as JSON, and the truthy values found
    under the ``"context"`` key are collected without duplicates, in the order
    they are first seen. The collected values are written as a JSON list to
    ``<name>_unique_contexts.json`` inside ``output_directory``.

    Problems are reported with ``print`` rather than raised:

    * a line that is not valid JSON, is not a JSON object, or whose
      ``context`` cannot be used as a dictionary key is skipped on its own;
    * a file that cannot be opened or read is skipped entirely and no output
      file is written for it;
    * a failure while writing an output file is reported and processing
      continues with the next input file.

    Args:
        input_directory: Directory searched (non-recursively) for ``*.jsonl``.
        output_directory: Directory receiving the output files; it is created
            if it does not exist.

    Returns:
        None.
    """
    os.makedirs(output_directory, exist_ok=True)

    jsonl_files = glob.glob(os.path.join(input_directory, "*.jsonl"))
    print(f"Found {len(jsonl_files)} JSONL files.")

    for file_path in jsonl_files:
        filename = os.path.basename(file_path)
        name, _ = os.path.splitext(filename)
        output_filename = f"{name}_unique_contexts.json"
        output_path = os.path.join(output_directory, output_filename)

        # A dict serves as an insertion-ordered set, so the output keeps the
        # order in which contexts first appear in the file.
        unique_contexts_dict = {}

        print(f"Processing file: {filename}")

        try:
            with open(file_path, "r", encoding="utf-8") as infile:
                for line_number, line in enumerate(infile, start=1):
                    line = line.strip()
                    if not line:
                        continue

                    # Keep the try body to the parse call so that only real
                    # decoding failures are reported as such.
                    try:
                        json_obj = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(
                            f"JSON decoding error in file {filename} at line {line_number}: {e}"
                        )
                        continue

                    # Valid JSON need not be an object (e.g. a list or a
                    # number); such a line has no "context" key to read and
                    # must not abort the rest of the file.
                    if not isinstance(json_obj, dict):
                        print(
                            f"Skipping non-object JSON value in file {filename} "
                            f"at line {line_number}."
                        )
                        continue

                    context = json_obj.get("context")
                    if not context:
                        continue

                    # A list/dict context is unhashable and cannot be a dict
                    # key; skip that record instead of losing the whole file.
                    try:
                        unique_contexts_dict.setdefault(context, None)
                    except TypeError:
                        print(
                            f"Skipping unhashable `context` in file {filename} "
                            f"at line {line_number}."
                        )
        except FileNotFoundError:
            # The file may disappear between the glob and the open.
            print(f"File not found: {filename}")
            continue
        except Exception as e:
            # Best-effort batch job: report (e.g. decode/permission errors)
            # and move on to the next file.
            print(f"An error occurred while processing file {filename}: {e}")
            continue

        unique_contexts_list = list(unique_contexts_dict)
        print(
            f"There are {len(unique_contexts_list)} unique `context` entries in the file {filename}."
        )

        try:
            with open(output_path, "w", encoding="utf-8") as outfile:
                json.dump(unique_contexts_list, outfile, ensure_ascii=False, indent=4)
            print(f"Unique `context` entries have been saved to: {output_filename}")
        except Exception as e:
            print(f"An error occurred while saving to the file {output_filename}: {e}")

    print("All files have been processed.")
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the input/output directories from the command
    # line (both optional, with dataset-relative defaults) and run the
    # extraction over them.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-i",
        "--input_dir",
        type=str,
        default="../datasets",
    )
    cli.add_argument(
        "-o",
        "--output_dir",
        type=str,
        default="../datasets/unique_contexts",
    )

    options = cli.parse_args()
    extract_unique_contexts(options.input_dir, options.output_dir)
|
|