import os
import kaggle
import tempfile
import requests
import multiprocessing

import pandas as pd

from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def _generate_sources() -> pd.DataFrame:
	""" Generate a dataset containing urls to retrieve data from"""
	dataset = pd.DataFrame({'type': [], 'name': [], 'url': []})
	with tempfile.TemporaryDirectory() as temp_dir:
		kaggle.api.dataset_download_files('rohanrao/formula-1-world-championship-1950-2020', path=temp_dir, unzip=True)

		# Circuits
		df = pd.read_csv(temp_dir + '/circuits.csv')
		
		# remove all columns except 'name' and 'url'
		df = df[['name', 'url']]
		df['type'] = 'circuit'
		dataset = pd.concat([dataset, df], ignore_index=True)

		# Drivers
		df = pd.read_csv(temp_dir + '/drivers.csv')

		# remove all columns except 'forename', 'surname' and 'url'
		df = df[['forename', 'surname', 'url']]

		# Join 'forename' and 'surname' columns
		df['name'] = df['forename'] + ' ' + df['surname']

		df = df[['name', 'url']]
		df['type'] = 'driver'

		dataset = pd.concat([dataset, df], ignore_index=True)

		# Constructors
		df = pd.read_csv(temp_dir + '/constructors.csv')

		# Remove broken links
		df = df[(df['url'] != 'http://en.wikipedia.org/wiki/Turner_(constructor)') & (df['url'] != 'http://en.wikipedia.org/wiki/Hall_(constructor)')]

		# remove all columns except 'name' and 'url'
		df = df[['name', 'url']]
		df['type'] = 'constructor'

		dataset = pd.concat([dataset, df], ignore_index=True)

		# Races
		df = pd.read_csv(temp_dir + '/races.csv')

		# Build a unique race name from the race name, year and round
		df['name'] = df['name'] + " " + df['year'].astype(str) + "-" + df['round'].astype(str)

		# remove all columns except 'name' and 'url'
		df = df[['name', 'url']]
		df['type'] = 'race'

		dataset = pd.concat([dataset, df], ignore_index=True)

		# Seasons
		df = pd.read_csv(temp_dir + '/seasons.csv')

		# remove all columns except 'year' and 'url'
		df = df[['year', 'url']]
		df['name'] = 'Year ' + df['year'].astype(str)
		
		df = df[['name', 'url']]
		df['type'] = 'season'

		dataset = pd.concat([dataset, df], ignore_index=True)
	
	return dataset
	
def _extract_paragraphs(url):
	"""Download the page at `url` and return the text of its <p> paragraphs."""
	response = requests.get(url)
	html = response.text

	soup = BeautifulSoup(html, "html.parser")

	pars = soup.find_all("p")
	pars = [p.get_text() for p in pars]
	return pars

def generate_trainset(persist: bool = True, persist_path: str = './datasets', filename: str = 'train.csv') -> pd.DataFrame:
	"""
	Generate the dataset used to train the model.

	Parameters:
	persist (bool): Whether to save the generated dataset to a file.
	persist_path (str): The directory where the generated dataset will be saved.
	filename (str): The name of the file to save the dataset to.

	Returns:
	pd.DataFrame: The generated DataFrame.
	"""

	if os.path.exists(f"{persist_path}/{filename}"):
		return pd.read_csv(f"{persist_path}/{filename}")
	
	sources = _generate_sources()

	num_threads = multiprocessing.cpu_count()
	with ThreadPoolExecutor(max_workers=num_threads) as executor:
		paragraphs = list(executor.map(_extract_paragraphs, sources['url']))
		paragraphs = [" ".join(p[0:5]).strip("\n") for p in paragraphs] 		# Take the first 4 paragraphs
		sources['description'] = paragraphs
	df = sources[['type', 'name', 'description']]

	if persist:
		os.makedirs(persist_path, exist_ok=True)
		df.to_csv(f"{persist_path}/{filename}", index=False)

	return df

def generate_ragset(persist: bool = True, persist_path: str = './datasets', filename: str = 'rag.csv') -> pd.DataFrame:
	"""
	Generate the dataset used for Retrieval-Augmented Generation.

	Parameters:
	persist (bool): Whether to save the generated dataset to a file.
	persist_path (str): The directory where the generated dataset will be saved.
	filename (str): The name of the file to save the dataset to.

	Returns:
	pd.DataFrame: The generated DataFrame.
	"""

	if os.path.exists(f"{persist_path}/{filename}"):
		return pd.read_csv(f"{persist_path}/{filename}")
	
	sources = _generate_sources()

	num_threads = multiprocessing.cpu_count()
	with ThreadPoolExecutor(max_workers=num_threads) as executor:
		paragraphs = list(executor.map(_extract_paragraphs, sources['url']))
		paragraphs = [" ".join(p).strip("\n") for p in paragraphs]				# Take all the paragraphs
		sources['description'] = paragraphs
	df = sources[['type', 'name', 'description']]

	if persist:
		os.makedirs(persist_path, exist_ok=True)
		df.to_csv(f"{persist_path}/{filename}", index=False)

	return df
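
if __name__ == '__main__':
	# Minimal usage sketch, not part of the original module's public API:
	# build both datasets and print their sizes. Assumes Kaggle API credentials
	# are configured (e.g. ~/.kaggle/kaggle.json) and that Wikipedia is reachable.
	train_df = generate_trainset()
	rag_df = generate_ragset()
	print(f"trainset: {len(train_df)} rows, ragset: {len(rag_df)} rows")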