MarioCerulo committed
Commit 50b2d56 · verified · 1 Parent(s): f7dd461

Upload 3 files
utils/__init__.py ADDED
File without changes
utils/dataset_utils.py ADDED
@@ -0,0 +1,144 @@
+ import os
+ import kaggle
+ import tempfile
+ import requests
+ import multiprocessing
+
+ import pandas as pd
+
+ from bs4 import BeautifulSoup
+ from concurrent.futures import ThreadPoolExecutor
+
+ def _generate_sources() -> pd.DataFrame:
+     """Generate a dataset containing URLs to retrieve data from."""
+     dataset = pd.DataFrame({'type': [], 'name': [], 'url': []})
+     with tempfile.TemporaryDirectory() as temp_dir:
+         kaggle.api.dataset_download_files('rohanrao/formula-1-world-championship-1950-2020', path=temp_dir, unzip=True)
+
+         # Circuits: remove all columns except 'name' and 'url'
+         df = pd.read_csv(temp_dir + '/circuits.csv')
+         df = df[['name', 'url']]
+         df['type'] = 'circuit'
+         dataset = pd.concat([dataset, df], ignore_index=True)
+
+         # Drivers
+         df = pd.read_csv(temp_dir + '/drivers.csv')
+
+         # remove all columns except 'forename', 'surname' and 'url'
+         df = df[['forename', 'surname', 'url']]
+
+         # Join 'forename' and 'surname' columns
+         df['name'] = df['forename'] + ' ' + df['surname']
+
+         df = df[['name', 'url']]
+         df['type'] = 'driver'
+
+         dataset = pd.concat([dataset, df], ignore_index=True)
+
+         # Constructors
+         df = pd.read_csv(temp_dir + '/constructors.csv')
+
+         # Remove broken links
+         df = df[(df['url'] != 'http://en.wikipedia.org/wiki/Turner_(constructor)') & (df['url'] != 'http://en.wikipedia.org/wiki/Hall_(constructor)')]
+
+         # remove all columns except 'name' and 'url'
+         df = df[['name', 'url']]
+         df['type'] = 'constructor'
+
+         dataset = pd.concat([dataset, df], ignore_index=True)
+
+         # Races
+         df = pd.read_csv(temp_dir + '/races.csv')
+
+         # Build a unique name from race name, year and round, then keep only 'name' and 'url'
+         df['name'] = df['name'] + " " + df['year'].astype(str) + "-" + df['round'].astype(str)
+         df = df[['name', 'url']]
+         df['type'] = 'race'
+
+         dataset = pd.concat([dataset, df], ignore_index=True)
+
+         # Seasons
+         df = pd.read_csv(temp_dir + '/seasons.csv')
+
+         # remove all columns except 'year' and 'url'
+         df = df[['year', 'url']]
+         df['name'] = 'Year ' + df['year'].astype(str)
+
+         df = df[['name', 'url']]
+         df['type'] = 'season'
+
+         dataset = pd.concat([dataset, df], ignore_index=True)
+
+     return dataset
+
+ def _extract_paragraphs(url):
+     response = requests.get(url)
+     html = response.text
+
+     soup = BeautifulSoup(html, "html.parser")
+
+     pars = soup.find_all("p")
+     pars = [p.get_text() for p in pars]
+     return pars
+
+ def generate_trainset(persist: bool = True, persist_path: str = './datasets', filename='train.csv') -> pd.DataFrame:
+     """
+     Generate the dataset used to train the model.
+
+     Parameters:
+         persist (bool): Whether to save the generated dataset to a file.
+         persist_path (str): The directory where the generated dataset will be saved.
+         filename (str): The name of the file to save the dataset to.
+
+     Returns:
+         pd.DataFrame: The generated DataFrame.
+     """
+
+     if os.path.exists(persist_path + '/' + filename):
+         return pd.read_csv(f"{persist_path}/{filename}")
+
+     sources = _generate_sources()
+
+     num_threads = multiprocessing.cpu_count()
+     with ThreadPoolExecutor(max_workers=num_threads) as executor:
+         paragraphs = list(executor.map(_extract_paragraphs, sources['url']))
+     paragraphs = [" ".join(p[0:5]).strip("\n") for p in paragraphs]  # Take the first 5 paragraphs
+     sources['description'] = paragraphs
+     df = sources[['type', 'name', 'description']]
+
+     if persist:
+         os.makedirs(persist_path, exist_ok=True)
+         df.to_csv(f"{persist_path}/{filename}", index=False)
+
+     return df
+
+ def generate_ragset(persist: bool = True, persist_path: str = './datasets', filename='rag.csv') -> pd.DataFrame:
+     """
+     Generate the dataset used for Retrieval-Augmented Generation.
+
+     Parameters:
+         persist (bool): Whether to save the generated dataset to a file.
+         persist_path (str): The directory where the generated dataset will be saved.
+         filename (str): The name of the file to save the dataset to.
+
+     Returns:
+         pd.DataFrame: The generated DataFrame.
+     """
+
+     if os.path.exists(persist_path + '/' + filename):
+         return pd.read_csv(f"{persist_path}/{filename}")
+
+     sources = _generate_sources()
+
+     num_threads = multiprocessing.cpu_count()
+     with ThreadPoolExecutor(max_workers=num_threads) as executor:
+         paragraphs = list(executor.map(_extract_paragraphs, sources['url']))
+     paragraphs = [" ".join(p).strip("\n") for p in paragraphs]  # Take all the paragraphs
+     sources['description'] = paragraphs
+     df = sources[['type', 'name', 'description']]
+
+     if persist:
+         os.makedirs(persist_path, exist_ok=True)
+         df.to_csv(f"{persist_path}/{filename}", index=False)
+
+     return df
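
For context, a minimal usage sketch of the two generators above (not part of the commit): it assumes valid Kaggle API credentials (~/.kaggle/kaggle.json) and write access to the default ./datasets directory, since _generate_sources downloads the Kaggle dataset and the helpers persist their CSVs there.

    from utils.dataset_utils import generate_trainset, generate_ragset

    # Fine-tuning data: first five Wikipedia paragraphs per entity
    train_df = generate_trainset(persist=True, persist_path='./datasets', filename='train.csv')

    # RAG data: the full Wikipedia article text per entity
    rag_df = generate_ragset(persist=True, persist_path='./datasets', filename='rag.csv')

    print(train_df.columns.tolist())  # ['type', 'name', 'description']
    print(len(rag_df))                # one row per circuit, driver, constructor, race and season

On a second run both calls return the cached CSVs instead of re-scraping, because each function short-circuits when the target file already exists.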
utils/embedding_utils.py ADDED
@@ -0,0 +1,11 @@
+ from sentence_transformers import SentenceTransformer
+ from chromadb import Documents, Embeddings, EmbeddingFunction
+
+ class CustomEmbeddingFunction(EmbeddingFunction):
+     def __call__(self, text_chunks: Documents) -> Embeddings:
+         embedding_model = SentenceTransformer(
+             model_name_or_path="all-mpnet-base-v2",
+             device="cpu",
+         )
+
+         return embedding_model.encode(text_chunks)
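
This embedding function is meant to be passed to a Chroma collection. A minimal sketch (not part of the commit), assuming a recent chromadb client; the collection name and documents are illustrative only, and note that the SentenceTransformer model is re-loaded on every __call__, so some chromadb versions (which expect the callable's argument to be named input) may need a small adaptation:

    import chromadb
    from utils.embedding_utils import CustomEmbeddingFunction

    client = chromadb.Client()
    collection = client.create_collection(
        name="f1_docs",                              # hypothetical name, for illustration
        embedding_function=CustomEmbeddingFunction(),
    )
    collection.add(
        ids=["circuit-monza"],
        documents=["Monza is a high-speed circuit near Milan, Italy."],
    )
    results = collection.query(query_texts=["Italian Grand Prix track"], n_results=1)
    print(results["documents"])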