import csv
import re
import pandas as pd
import pickle
import sqlite3
import gradio as gr
import os
from qatch.connectors.sqlite_connector import SqliteConnector
from qatch.evaluate_dataset.metrics_evaluators import CellPrecision, CellRecall, ExecutionAccuracy, TupleCardinality, TupleConstraint, TupleOrder, ValidEfficiencyScore
import qatch.evaluate_dataset.orchestrator_evaluator as eva
import utils_get_db_tables_info
#import tiktoken
from transformers import AutoTokenizer
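# Return the names of the user tables in a SQLite file (the internal
# 'sqlite_sequence' table is filtered out).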
def extract_tables(file_path):
    conn = sqlite3.connect(file_path)
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tabelle = cursor.fetchall()
    conn.close()  # release the connection once the table names have been read
    tabelle = [tabella for tabella in tabelle if tabella[0] != 'sqlite_sequence']
    return tabelle
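# Read every table of a SQLite file into a pandas DataFrame, keyed by table name.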
def extract_dataframes(file_path):
    conn = sqlite3.connect(file_path)
    tabelle = extract_tables(file_path)
    dfs = {}
    for tabella in tabelle:
        nome_tabella = tabella[0]
        df = pd.read_sql_query(f"SELECT * FROM {nome_tabella}", conn)
        dfs[nome_tabella] = df
    conn.close()
    return dfs
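# Load a SQLite file: extract its tables as DataFrames and wrap the file in a QATCH SqliteConnector.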
def carica_sqlite(file_path, db_id):
    data_output = {'data_frames': extract_dataframes(file_path), 'db': SqliteConnector(relative_db_path=file_path, db_name=db_id)}
    return data_output
# Read a CSV file into a DataFrame
def load_csv(file):
    df = pd.read_csv(file)
    return df
# Read an Excel file, returning one DataFrame per sheet
def carica_excel(file):
    xls = pd.ExcelFile(file)
    dfs = {}
    for sheet_name in xls.sheet_names:
        dfs[sheet_name] = xls.parse(sheet_name)
    return dfs
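# Dispatch on the file extension and load the data as SQLite, CSV or Excel.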
def load_data(data_path: str, db_name: str):
    data_output = {'data_frames': {}, 'db': None}
    table_name = os.path.splitext(os.path.basename(data_path))[0]
    if data_path.endswith(".sqlite"):
        data_output = carica_sqlite(data_path, db_name)
    elif data_path.endswith(".csv"):
        data_output['data_frames'] = {f"{table_name}_table": load_csv(data_path)}
    elif data_path.endswith(".xlsx"):
        data_output['data_frames'] = carica_excel(data_path)
    else:
        raise gr.Error("Unsupported file format. Please upload a SQLite, CSV or Excel file.")
    return data_output
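# Read an API key stored as plain text in a file.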
def read_api(api_key_path):
    with open(api_key_path, "r", encoding="utf-8") as file:
        api_key = file.read()
    return api_key
def read_models_csv(file_path):
    # Reads a CSV file and returns a list of dictionaries
    models = []
    with open(file_path, mode="r", newline="") as file:
        reader = csv.DictReader(file)
        for row in reader:
            row["price"] = float(row["price"])  # Convert price to float
            models.append(row)
    return models
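# Like read_models_csv, but converts the "price" field only when it is present.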
def csv_to_dict(file_path):
    with open(file_path, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        data = []
        for row in reader:
            if "price" in row:
                row["price"] = float(row["price"])
            data.append(row)
    return data
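# Increment the largest number embedded in a file name, or append '1' if there is none.
# For example, increment_filename("results_3.csv") returns "results_4.csv".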
def increment_filename(filename):
    base, ext = os.path.splitext(filename)
    numbers = re.findall(r'\d+', base)
    if numbers:
        max_num = max(map(int, numbers)) + 1
        new_base = re.sub(r'(\d+)', lambda m: str(max_num) if int(m.group(1)) == max(map(int, numbers)) else m.group(1), base)
    else:
        new_base = base + '1'
    return new_base + ext
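# Fill the {db_schema} and {question} placeholders of a prompt template and append a few sample rows.
# Illustrative call (the template text and values below are made up):
#   prepare_prompt("Schema: {db_schema}\nQuestion: {question}", "How many rows?", "customers(id, name)", samples)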
def prepare_prompt(prompt, question, schema, samples):
    prompt = prompt.replace("{db_schema}", schema).replace("{question}", question)
    prompt += f" Some instances: {samples}"
    return prompt
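# Fetch up to three sample rows from a table; the result is a list containing the rows
# as record dictionaries, or an error message if the query fails.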
def generate_some_samples(file_path, tbl_name):
    conn = sqlite3.connect(file_path)
    samples = []
    query = f"SELECT * FROM {tbl_name} LIMIT 3"
    try:
        sample_data = pd.read_sql_query(query, conn)
        samples.append(sample_data.to_dict(orient="records"))
        #samples.append(str(sample_data))
    except Exception as e:
        samples.append(f"Error: {e}")
    return samples
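# Load a previously pickled {table_name: DataFrame} dictionary.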
def load_tables_dict_from_pkl(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)
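# Build a {table_name: DataFrame} preview dictionary from the (tbl_name, db_path) pairs listed in a CSV.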
def extract_tables_dict(pnp_path):
    # NOTE: this early return loads the cached tables from a pickle file,
    # so the CSV-based extraction below is currently skipped.
    return load_tables_dict_from_pkl('tables_dict_beaver.pkl')
    tables_dict = {}
    with open(pnp_path, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        tbl_db_pairs = set()  # Use a set to avoid duplicates
        for row in reader:
            tbl_name = row.get("tbl_name")
            db_path = row.get("db_path")
            if tbl_name and db_path:
                tbl_db_pairs.add((tbl_name, db_path))  # Add the pair to the set
    for tbl_name, db_path in list(tbl_db_pairs):
        if tbl_name and db_path:
            connector = sqlite3.connect(db_path)
            query = f"SELECT * FROM {tbl_name} LIMIT 5"
            try:
                df = pd.read_sql_query(query, connector)
                tables_dict[tbl_name] = df
            except Exception as e:
                tables_dict[tbl_name] = pd.DataFrame({"Error": [str(e)]})  # DataFrame holding the error message
    #with open('tables_dict_beaver.pkl', 'wb') as f:
    #    pickle.dump(tables_dict, f)
    return tables_dict
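# Execute every 'query' against a temporary database built from 'db_schema'
# and store the result rows in a new 'target_answer' column.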
def extract_answer(df):
    if "query" not in df.columns or "db_path" not in df.columns:
        raise ValueError("The DataFrame must contain 'query' and 'db_path' columns.")
    answers = []
    for _, row in df.iterrows():
        query = row["query"]
        db_schema = row["db_schema"]
        #db_path = row["db_path"]
        try:
            conn = utils_get_db_tables_info.create_db_temp(db_schema)
            result = pd.read_sql_query(query, conn)
            answer = result.values.tolist()  # Convert the DataFrame to a list of lists
            answers.append(answer)
            conn.close()
        except Exception as e:
            answers.append(f"Error: {e}")
    df["target_answer"] = answers
    return df
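# QATCH metrics used to compare target and predicted query results.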
evaluator = {
    "cell_precision": CellPrecision(),
    "cell_recall": CellRecall(),
    "tuple_cardinality": TupleCardinality(),
    "tuple_order": TupleOrder(),
    "tuple_constraint": TupleConstraint(),
    "execution_accuracy": ExecutionAccuracy(),
    "valid_efficency_score": ValidEfficiencyScore()
}
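# Score each row of the DataFrame with every metric above; a metric that cannot be
# computed (unparsable prediction or a metric error) scores 0.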
def evaluate_answer(df):
    for metric_name, metric in evaluator.items():
        results = []
        for _, row in df.iterrows():
            target = row["target_answer"]
            predicted = row["predicted_answer"]
            try:
                predicted = eval(predicted)
            except Exception as e:
                result = 0
            else:
                try:
                    result = metric.run_metric(target=target, prediction=predicted)
                except Exception as e:
                    result = 0
            results.append(result)
        df[metric_name] = results
    return df
models = [
    "gpt-4o-mini",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
]
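# Halve the entry list recursively until the tokenized text fits in the context budget
# (2048 tokens, measured with the DeepSeek-R1-Distill-Llama-70B tokenizer).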
def crop_entries_per_token(entries_list, model, prompt: str | None = None):
    #open_ai_models = ["gpt-3.5", "gpt-4o-mini"]
    dimension = 2048
    #enties_string = [", ".join(map(str, entry)) for entry in entries_list]
    if prompt:
        entries_string = prompt.join(entries_list)  # the prompt text is used as the separator between entries
    else:
        entries_string = " ".join(entries_list)
    #if model in ["deepseek-ai/DeepSeek-R1-Distill-Llama-70B", "gpt-4o-mini"]:
    #tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="deepseek-ai/DeepSeek-R1-Distill-Llama-70B")
    tokens = tokenizer.encode(entries_string)
    number_of_tokens = len(tokens)
    if number_of_tokens > dimension and len(entries_list) > 4:
        entries_list = entries_list[:round(len(entries_list) / 2)]
        # pass the prompt through the recursive call so the token count is measured consistently
        entries_list = crop_entries_per_token(entries_list, model, prompt)
    return entries_list
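# Minimal usage sketch (illustrative only: the file path, database name and table
# name below are assumptions, not part of this module):
#
#   data = load_data("data/example.sqlite", "example_db")
#   samples = generate_some_samples("data/example.sqlite", "customers")
#   prompt = prepare_prompt(
#       "Schema: {db_schema}\nQuestion: {question}",
#       "How many customers are there?",
#       "customers(id, name)",
#       samples,
#   )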