import os
import kaggle
import tempfile
import requests
import multiprocessing
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor


def _generate_sources() -> pd.DataFrame:
    """Generate a DataFrame of Wikipedia URLs to retrieve data from."""
    dataset = pd.DataFrame({'type': [], 'name': [], 'url': []})
    with tempfile.TemporaryDirectory() as temp_dir:
        kaggle.api.dataset_download_files('rohanrao/formula-1-world-championship-1950-2020', path=temp_dir, unzip=True)
        # Circuits
        df = pd.read_csv(temp_dir + '/circuits.csv')
        # Keep only the 'name' and 'url' columns
        df = df[['name', 'url']]
        df['type'] = 'circuit'
        dataset = pd.concat([dataset, df], ignore_index=True)
        # Drivers
        df = pd.read_csv(temp_dir + '/drivers.csv')
        # Keep only the 'forename', 'surname' and 'url' columns
        df = df[['forename', 'surname', 'url']]
        # Join 'forename' and 'surname' into a single 'name' column
        df['name'] = df['forename'] + ' ' + df['surname']
        df = df[['name', 'url']]
        df['type'] = 'driver'
        dataset = pd.concat([dataset, df], ignore_index=True)
        # Constructors
        df = pd.read_csv(temp_dir + '/constructors.csv')
        # Remove broken links
        df = df[(df['url'] != 'http://en.wikipedia.org/wiki/Turner_(constructor)') & (df['url'] != 'http://en.wikipedia.org/wiki/Hall_(constructor)')]
        # Keep only the 'name' and 'url' columns
        df = df[['name', 'url']]
        df['type'] = 'constructor'
        dataset = pd.concat([dataset, df], ignore_index=True)
        # Races
        df = pd.read_csv(temp_dir + '/races.csv')
        # Disambiguate race names with year and round, then keep only 'name' and 'url'
        df['name'] = df['name'] + " " + df['year'].astype(str) + "-" + df['round'].astype(str)
        df = df[['name', 'url']]
        df['type'] = 'race'
        dataset = pd.concat([dataset, df], ignore_index=True)
        # Seasons
        df = pd.read_csv(temp_dir + '/seasons.csv')
        # Keep only the 'year' and 'url' columns, then build a readable 'name'
        df = df[['year', 'url']]
        df['name'] = 'Year ' + df['year'].astype(str)
        df = df[['name', 'url']]
        df['type'] = 'season'
        dataset = pd.concat([dataset, df], ignore_index=True)
    return dataset


def _extract_paragraphs(url):
    """Download a page and return the text of all its <p> elements."""
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    pars = soup.find_all("p")
    pars = [p.get_text() for p in pars]
    return pars


def generate_trainset(persist: bool = True, persist_path: str = './datasets', filename: str = 'train.csv') -> pd.DataFrame:
    """
    Generate the dataset used to train the model.

    Parameters:
        persist (bool): Whether to save the generated dataset to a file.
        persist_path (str): The directory where the generated dataset will be saved.
        filename (str): The name of the file the dataset is saved to.

    Returns:
        pd.DataFrame: The generated DataFrame.
    """
    if os.path.exists(persist_path + '/' + filename):
        return pd.read_csv(f"{persist_path}/{filename}")
    sources = _generate_sources()
    num_threads = multiprocessing.cpu_count()
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        paragraphs = list(executor.map(_extract_paragraphs, sources['url']))
    paragraphs = [" ".join(p[0:5]).strip("\n") for p in paragraphs]  # Keep only the first five paragraphs
    sources['description'] = paragraphs
    df = sources[['type', 'name', 'description']]
    if persist:
        os.makedirs(persist_path, exist_ok=True)
        df.to_csv(f"{persist_path}/{filename}", index=False)
    return df


def generate_ragset(persist: bool = True, persist_path: str = './datasets', filename: str = 'rag.csv') -> pd.DataFrame:
    """
    Generate the dataset used for Retrieval-Augmented Generation.

    Parameters:
        persist (bool): Whether to save the generated dataset to a file.
        persist_path (str): The directory where the generated dataset will be saved.
        filename (str): The name of the file the dataset is saved to.

    Returns:
        pd.DataFrame: The generated DataFrame.
    """
    if os.path.exists(persist_path + '/' + filename):
        return pd.read_csv(f"{persist_path}/{filename}")
    sources = _generate_sources()
    num_threads = multiprocessing.cpu_count()
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        paragraphs = list(executor.map(_extract_paragraphs, sources['url']))
    paragraphs = [" ".join(p).strip("\n") for p in paragraphs]  # Keep all the paragraphs
    sources['description'] = paragraphs
    df = sources[['type', 'name', 'description']]
    if persist:
        os.makedirs(persist_path, exist_ok=True)
        df.to_csv(f"{persist_path}/{filename}", index=False)
    return df
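

# Minimal usage sketch (not part of the original module): it assumes Kaggle API
# credentials are configured (e.g. ~/.kaggle/kaggle.json), since _generate_sources()
# downloads the Formula 1 dataset through kaggle.api, and that Wikipedia is reachable.
if __name__ == "__main__":
    # Build (or load from ./datasets, if the CSVs already exist) both datasets
    # and print a quick summary of what was generated.
    train_df = generate_trainset()
    rag_df = generate_ragset()
    print(train_df['type'].value_counts())
    print(rag_df.head())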