|
|
|
|
|
|
|
import os |
|
from io import StringIO |
|
import requests |
|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
import openai |
|
import tiktoken |
|
|
|
from openai.embeddings_utils import get_embedding, cosine_similarity |
|
|
|
|
|
|
|
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings |
|
from langchain.vectorstores import FAISS |
|
from langchain.chat_models import ChatOpenAI |
|
from langchain.memory import ConversationBufferMemory |
|
from langchain.chains import ConversationalRetrievalChain |
|
from langchain.llms import OpenAI, HuggingFaceHub |
|
from langchain.chains.question_answering import load_qa_chain |
|
|
|
|
|
import ast |
|
|
|
from langchain.schema import Document |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Credentials are taken from the environment; nothing is hard-coded here.
# The OpenAI key is read once and shared by the `openai` module and LangChain.
api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = api_key

# Token and headers sent with every request to the GitHub API URLs below.
# NOTE(review): the Accept media type presumably asks GitHub for the raw file
# body instead of a JSON envelope - confirm against the GitHub REST docs.
token = os.getenv("token")
headers = {
    'Authorization': f'token {token}',
    'Accept': 'application/vnd.github.v3.raw',
}
|
|
|
|
|
|
|
|
|
|
|
def _fetch_csv(url, timeout=60):
    """Download one CSV file and parse it into a DataFrame.

    Args:
        url: Address of the CSV file; requested with the module-level
            ``headers``.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang start-up forever.

    Returns:
        pandas.DataFrame: The parsed CSV content.

    Raises:
        RuntimeError: If ``url`` is empty or ``None`` (e.g. an unset
            environment variable).
        requests.HTTPError: If the server answers with an error status.
            Without this check an error body would be parsed as if it were
            CSV and fail much later with a confusing message.
    """
    if not url:
        raise RuntimeError("Missing CSV URL: check the 'url_tomos_conf_DPR' environment variable.")
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return pd.read_csv(StringIO(response.text))


# Confidential volumes (DPR); the address is provided through the environment.
url_tomos_conf_DPR = os.getenv("url_tomos_conf_DPR")
tomos_conf_DPR = _fetch_csv(url_tomos_conf_DPR)

# Confidential volumes, folios cited in the DPR.
url_tomos_conf_cita = 'https://api.github.com/repos/benjov/Data_Text_WM/contents/df/pages/04.%20Tomos%20Confidenciales%20WM%20(folios%20citados%20en%20DPR)/tomos_conf_cita.csv'
tomos_conf_cita = _fetch_csv(url_tomos_conf_cita)

# Reserved volumes 1-28 are stored as nine numbered parts (01..09) that share
# one URL pattern; they are downloaded in order and stacked into one frame.
_URL_DF_TOMOS_1A28 = (
    'https://api.github.com/repos/benjov/Data_Text_WM/contents/df/pages/'
    '01.%20Tomos%20Reservados/df_tomos_1a28_{:02d}.csv'
)
df_tomos_1a28 = pd.concat(
    [_fetch_csv(_URL_DF_TOMOS_1A28.format(parte)) for parte in range(1, 10)],
    ignore_index=True,
)
|
|
|
|
|
|
|
|
|
def clean_and_parse_embedding(embedding_str):
    """Parse a serialized embedding, ignoring any text around the brackets.

    Only the text between the last ``[`` and the first ``]`` after it is
    kept, so wrappers such as ``array([...])`` are discarded.

    Args:
        embedding_str: String holding a bracketed, comma-separated list of
            numbers.

    Returns:
        list[float]: The numbers as floats; ``[]`` for an empty list.

    Raises:
        ValueError, SyntaxError: If the bracket content is not a
            comma-separated sequence of Python literals.
    """
    inner = embedding_str.split('[')[-1].split(']')[0]
    # Re-wrap in brackets before evaluating: a bare "0.5" would evaluate to a
    # float (not iterable) and a bare "" is a syntax error, whereas "[0.5]"
    # and "[]" always evaluate to a list.
    embedding_list = ast.literal_eval(f"[{inner}]")
    return [float(val) for val in embedding_list]
|
|
|
# Both confidential-volume frames go through the bracket-tolerant parser, which
# replaces each serialized 'Embedding' cell with a list of floats.
for _frame in (tomos_conf_DPR, tomos_conf_cita):
    _frame['Embedding'] = _frame['Embedding'].apply(clean_and_parse_embedding)
|
|
|
|
|
|
|
|
|
def parse_embedding(embedding_str):
    """Turn the textual form of a numeric sequence into a list of floats.

    Args:
        embedding_str: Python-literal text of a sequence, e.g. ``"[0.1, 2]"``.

    Returns:
        list[float]: Every element of the evaluated literal converted with
        ``float``.
    """
    parsed = ast.literal_eval(embedding_str)
    return list(map(float, parsed))
|
|
|
# Decode the serialized vectors of the reserved volumes into lists of floats.
_parsed_embeddings = df_tomos_1a28['Embedding'].apply(parse_embedding)
df_tomos_1a28['Embedding'] = _parsed_embeddings

# The frames that the search functions receive, in this order.
list_of_dfs = [
    tomos_conf_DPR,
    tomos_conf_cita,
    df_tomos_1a28,
]
|
|
|
|
|
|
|
|
|
# Columns (and their order) of every search result.
_COLUMNAS_RESULTADO = ['PDFName', 'PageNumber', 'similitud', 'PageText', 'Folder']


def _buscar_top(busqueda, lista_de_datos, top_n):
    """Return the ``top_n`` rows most similar to ``busqueda`` across all frames.

    The query is embedded once with ``text-embedding-ada-002`` and compared
    with each row's ``Embedding`` through ``cosine_similarity``. The input
    frames are left untouched: the score is attached to a column subset
    instead of being written back, so concurrent searches do not write to the
    shared frames. Only the best ``top_n`` rows of each frame are kept before
    merging, which is sufficient because the global top ``top_n`` is always
    contained in the union of the per-frame tops.
    """
    busqueda_embed = get_embedding(busqueda, engine="text-embedding-ada-002")
    columnas_base = [col for col in _COLUMNAS_RESULTADO if col != 'similitud']

    resultados = []
    for datos in lista_de_datos:
        similitud = datos['Embedding'].apply(
            lambda embedding: cosine_similarity(embedding, busqueda_embed)
        )
        puntuados = datos[columnas_base].assign(similitud=similitud)[_COLUMNAS_RESULTADO]
        resultados.append(
            puntuados.sort_values("similitud", ascending=False).head(top_n)
        )

    if not resultados:
        # pd.concat raises on an empty list; an empty result is more useful.
        return pd.DataFrame(columns=_COLUMNAS_RESULTADO)

    return pd.concat(resultados).sort_values("similitud", ascending=False).head(top_n)


def buscar(busqueda, lista_de_datos):
    """Find the 20 pages most similar to the query.

    Args:
        busqueda: Query text.
        lista_de_datos: Iterable of DataFrames with the columns ``PDFName``,
            ``PageNumber``, ``PageText``, ``Folder`` and ``Embedding``.

    Returns:
        pandas.DataFrame: Up to 20 rows with the columns ``PDFName``,
        ``PageNumber``, ``similitud``, ``PageText`` and ``Folder``, sorted by
        descending ``similitud``. Row labels are those of the source frames,
        so they may repeat.
    """
    return _buscar_top(busqueda, lista_de_datos, 20)


def buscar_ai(busqueda, lista_de_datos):
    """Find the 10 pages most similar to the query.

    Same contract as ``buscar`` but limited to 10 rows.
    """
    return _buscar_top(busqueda, lista_de_datos, 10)
|
|
|
|
|
|
|
|
|
def count_text_extracted(pregunta):
    """Summarize which files and pages the search for ``pregunta`` returned.

    Args:
        pregunta: Query text forwarded to ``buscar``.

    Returns:
        str: One sentence (in Spanish) per (folder, PDF) pair found in the
        results, giving the number of matching pages and their page numbers,
        each followed by a blank line. Empty string when nothing matched.
    """
    encontrados = buscar(pregunta, list_of_dfs)

    # Iterating the grouped column yields each (Folder, PDFName) key in sorted
    # order together with that file's page numbers in result order.
    paginas_por_archivo = encontrados.groupby(['Folder', 'PDFName'])['PageNumber']

    frases = []
    for (folder_name, pdf_name), paginas in paginas_por_archivo:
        count = paginas.count()
        page_numbers_str = ', '.join(str(numero) for numero in paginas.tolist())
        frases.append(
            f"Usé el archivo '{pdf_name}' del folder '{folder_name}' {count} (vez/veces) al extraer el texto de las páginas {page_numbers_str}.\n\n"
        )

    return "".join(frases)
|
|
|
|
|
|
|
|
|
def print_pdf_info(pregunta):
    """Render the text of every page returned by the search for ``pregunta``.

    Args:
        pregunta: Query text forwarded to ``buscar``.

    Returns:
        str: For each result row, a header with the PDF name and page number
        followed by the page text, tab-indented and framed by dashed rules.
        Empty string when nothing matched.
    """
    df = buscar(pregunta, list_of_dfs)

    bloques = []
    for pdf_name, page_number, page_text in zip(df['PDFName'], df['PageNumber'], df['PageText']):
        # An empty CSV cell is read back by pandas as NaN (a float), which has
        # no .split(); treat a missing page text as an empty page.
        page_text = "" if pd.isna(page_text) else str(page_text)

        indented_page_text = '\n'.join('\t' + line for line in page_text.split('\n'))

        bloques.append(
            f'De "{pdf_name}":\n \tPágina {page_number}:\n\t ------------------------------------------------------------------------------------------------------------------------------------\n{indented_page_text}\n------------------------------------------------------------------------------------------------------------------------------------\n'
        )

    # Join once at the end instead of growing a string inside the loop.
    return ''.join(bloques)
|
|
|
|
|
|
|
def vector_document(dataframe):
    """Wrap each ``PageText`` value of ``dataframe`` in a LangChain ``Document``.

    Args:
        dataframe: Frame with a ``PageText`` column.

    Returns:
        list: One ``Document`` per row, in row order, whose metadata ``id`` is
        the row's zero-based position (not its index label).
    """
    documents = []
    for position, page_text in enumerate(dataframe["PageText"]):
        documents.append(Document(page_content=page_text, metadata={'id': position}))
    return documents
|
|
|
|
|
|
|
|
|
def get_completion_from_messages(messages, model="gpt-3.5-turbo-16k",
                                 temperature=0, max_tokens=4500):
    """Send a chat conversation to OpenAI and return the reply text.

    Args:
        messages: List of ``{'role': ..., 'content': ...}`` dictionaries.
        model: Name of the chat model to query.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound passed to the API for the reply length.

    Returns:
        str: Content of the first choice in the API response.
    """
    request = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    completion = openai.ChatCompletion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message["content"]
|
|
|
def get_topic(user_message):
    """Ask the chat model to answer ``user_message`` as a competition lawyer.

    The system prompt (in Spanish) sets the persona and announces the
    delimiter; the user message is sent wrapped in that delimiter.

    Args:
        user_message: Question or query text.

    Returns:
        str: The model's reply, as returned by
        ``get_completion_from_messages`` with its default settings.
    """
    delimiter = "####"
    # Assembled line by line so that no source-code indentation ends up in the
    # prompt. Spelling fixed: "intenarás" -> "intentarás".
    system_message = (
        "Eres un abogado que trabaja en temas de competencia económica e investiga casos en México.\n"
        "Siempre intentarás responder en el mayor número posible de palabras.\n"
        f"Las consultas o preguntas se delimitarán con los caracteres {delimiter}\n"
    )

    messages = [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': f"{delimiter}{user_message}{delimiter}"},
    ]
    return get_completion_from_messages(messages)
|
|
|
def get_respuesta(user_message, informacion):
    """Ask the chat model to answer using only the supplied information.

    Args:
        user_message: Question to answer.
        informacion: Source material. A list or tuple is written into the
            prompt as passages separated by blank lines; any other value is
            inserted through ``str``.

    Returns:
        str: The model's reply, as returned by
        ``get_completion_from_messages`` with its default settings.
    """
    delimiter = "####"
    # Assembled line by line so that no source-code indentation ends up in the
    # prompt. Spelling fixed: "intenarás" -> "intentarás".
    system_message = (
        "Eres un abogado que trabaja en temas de competencia económica e investiga casos en México.\n"
        "Siempre intentarás responder en el mayor número posible de palabras.\n"
        f"Las consultas o preguntas se delimitarán con los caracteres {delimiter}\n"
    )

    # Interpolating a list directly would insert its Python repr (brackets,
    # quotes and escaped "\n" sequences) into the prompt; write the passages
    # out as plain text instead.
    if isinstance(informacion, (list, tuple)):
        contexto = "\n\n".join(str(fragmento) for fragmento in informacion)
    else:
        contexto = str(informacion)

    user_content = (
        f"{delimiter}\n"
        "Estás intentando recopilar información relevante para tu caso.\n"
        "Usa exclusivamente la información contenida en la siguiente lista:\n"
        f"{contexto}\n"
        "\n"
        f"para responder sin límite de palabras lo siguiente: {user_message}\n"
        "Responde de forma detallada.\n"
        f"{delimiter}\n"
    )

    messages = [
        {'role': 'system', 'content': system_message},
        {'role': 'user', 'content': user_content},
    ]

    return get_completion_from_messages(messages)
|
|
|
def chat(user_message_1):
    """Answer a question from the 10 pages most similar to it.

    The pages are retrieved with ``buscar_ai`` over ``list_of_dfs`` and their
    texts are handed to ``get_respuesta`` as the only allowed source.

    A previous ``get_topic`` request was dropped: its reply was never used in
    the result, so it only added one extra model call per question.

    Args:
        user_message_1: Question text.

    Returns:
        str: The model's answer.
    """
    paginas = buscar_ai(user_message_1, list_of_dfs)
    lista_info = paginas['PageText'].tolist()
    return get_respuesta(user_message_1, lista_info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# LangChain LLM wrapper used by the question-answering chain in `load`.
llm_mio = OpenAI(
    openai_api_key=api_key,
    model_name="gpt-3.5-turbo-16k",
)


def load(query):
    """Answer ``query`` with a LangChain "stuff" QA chain over the search hits.

    Args:
        query: Question text; used both for the search and as the chain's
            question.

    Returns:
        The value returned by ``chain.run`` for the retrieved documents.
    """
    hits = buscar(query, list_of_dfs)
    page_documents = vector_document(hits)
    qa_chain = load_qa_chain(llm=llm_mio, chain_type="stuff")
    return qa_chain.run(input_documents=page_documents, question=query)
|
|
|
with gr.Blocks() as demo:
    # Input box and trigger button.
    txt = gr.Textbox(label="Pregunta", lines=2)
    btn = gr.Button(value="Buscar")

    # Output boxes: where the text was found, the matching pages, the answer.
    txt_2 = gr.Textbox(value="", label="Donde:")
    txt_3 = gr.Textbox(value="", label="Cuales:")
    txt_1 = gr.Textbox(value="", label="Respuesta IA:")

    # One click feeds the same question to three handlers, each bound to its
    # own output box.
    for _handler, _target in (
        (load, txt_1),
        (count_text_extracted, txt_2),
        (print_pdf_info, txt_3),
    ):
        btn.click(_handler, inputs=[txt], outputs=[_target])


# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()