Spaces:
Sleeping
Sleeping
from datasets import load_dataset | |
import json | |
# URL template for the dataset's webdataset tar shards; the ``{i:06d}``
# placeholder is filled with the zero-padded shard index.
base_url = "https://huggingface.co/datasets/jackyhate/text-to-image-2M/resolve/main/data_512_2M/data_{i:06d}.tar"
num_shards = 46  # Number of webdataset tar files
def download_data(base_url, num_shards):
    """Open the webdataset shards as a single streaming ``train`` split.

    Args:
        base_url: URL template that is formatted with ``i=<shard index>``.
        num_shards: Number of shards to include; indices
            ``0 .. num_shards - 1`` are used.

    Returns:
        The object returned by ``load_dataset`` for the ``train`` split,
        requested with ``streaming=True``.
    """
    # One URL per shard, all registered under the "train" split.
    data_files = {
        "train": [base_url.format(i=shard_index) for shard_index in range(num_shards)]
    }
    return load_dataset(
        "webdataset",
        data_files=data_files,
        split="train",
        streaming=True,
    )
def extract_prompts(dataset, json_file_path):
    """Write the prompt of every dataset row to a JSON Lines file.

    Each row's ``row['json']['prompt']`` value is JSON-encoded and written
    on its own line, in iteration order.

    Args:
        dataset: Iterable of rows; every row must support
            ``row['json']['prompt']``.
        json_file_path: Destination path. The file is opened in ``'w'``
            mode, so existing content is replaced.

    Raises:
        OSError: If the destination file cannot be opened for writing.
        KeyError: If a row has no ``'json'`` or ``'prompt'`` key.
    """
    # Write to the path that was actually passed in, not to a module-level
    # global that happens to have a similar name.
    with open(json_file_path, 'w', encoding='utf-8') as f:
        for row in dataset:
            # Rows are written as they arrive; nothing is accumulated, so
            # memory use stays flat on a large streaming dataset.
            f.write(json.dumps(row['json']['prompt']) + '\n')
def read_data(jsonl_file_path):
    """Print every record stored in a JSON Lines file.

    Args:
        jsonl_file_path: Path of the file to read; one JSON value per line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(jsonl_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would otherwise make json.loads raise.
            if not line.strip():
                continue
            print(json.loads(line))
def load_prompts_from_jsonl(file_path):
    """Load every record of a JSON Lines file into a list.

    Each non-blank line is decoded with ``json.loads`` and appended as-is;
    no field is extracted from the decoded value.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # record and would otherwise make json.loads raise.
        return [json.loads(line) for line in f if line.strip()]
if __name__ == "__main__":
    # NOTE(review): machine-specific absolute path; consider making it
    # configurable before running this anywhere else.
    jsonl_file_path = r"C:\Users\jov2bg\Desktop\PromptSearch\search_engine\models\prompts_data.jsonl"
    # Override the module-level shard count so only the first shard is used.
    num_shards = 1
    # download_data expects (base_url, num_shards); pass by keyword so the
    # two arguments cannot be mixed up.
    dataset = download_data(base_url=base_url, num_shards=num_shards)
    extract_prompts(dataset, jsonl_file_path)
    read_data(jsonl_file_path)