Syluh27 committed on
Commit 122e667 · 1 Parent(s): 03e0d76
Files changed (2)
  1. model.py +42 -33
  2. requirements.txt +2 -1
model.py CHANGED
@@ -1,3 +1,4 @@
+ # updated model.py
  from langchain.chains import RetrievalQA
  from langchain.vectorstores import Chroma
  from langchain.embeddings import HuggingFaceEmbeddings
@@ -7,70 +8,78 @@ from huggingface_hub import hf_hub_download
  import os
  import shutil

- # Essential configuration for Spaces
+ # 1. Essential configuration
  HF_TOKEN = os.getenv("HF_TOKEN")
  MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
+ CHROMA_DIR = "/home/user/app/chroma_db"

- # 1. Set up Spaces-specific paths
- CHROMA_DIR = "/home/user/app/chroma_db"  # Path inside the persistent Space storage
- os.makedirs(CHROMA_DIR, exist_ok=True)
-
- # 2. Initial conflict cleanup
- def clean_space():
-     paths_to_clean = [
-         "/home/user/.cache/huggingface/hub/datasets--VictorCarr02--Conversational-Agent-LawsEC",
-         CHROMA_DIR
-     ]
-
-     for path in paths_to_clean:
-         if os.path.exists(path):
-             shutil.rmtree(path, ignore_errors=True)
-     os.makedirs(CHROMA_DIR, exist_ok=True)
-
- clean_space()
-
- # 3. Download and move chroma.sqlite3
- chroma_source = hf_hub_download(
-     repo_id="VictorCarr02/Conversational-Agent-LawsEC",
-     repo_type="dataset",
-     filename="chroma.sqlite3",
-     token=HF_TOKEN,
-     force_download=True
- )
-
- # Move it into the controlled directory
- shutil.move(chroma_source, os.path.join(CHROMA_DIR, "chroma.sqlite3"))
-
- # 4. Initialize ChromaDB
- chroma_client = chromadb.PersistentClient(path=CHROMA_DIR)
- collection = chroma_client.get_or_create_collection("legal_docs")
+ # 2. Aggressive initial cleanup
+ def full_clean():
+     # Remove every previous trace
+     shutil.rmtree(CHROMA_DIR, ignore_errors=True)
+     shutil.rmtree("/home/user/.cache/huggingface/hub", ignore_errors=True)
+     os.makedirs(CHROMA_DIR, exist_ok=True)
+
+ full_clean()
+
+ # 3. Download and set up ChromaDB
+ def setup_chroma():
+     # Download the original file
+     chroma_src = hf_hub_download(
+         repo_id="VictorCarr02/Conversational-Agent-LawsEC",
+         repo_type="dataset",
+         filename="chroma.sqlite3",
+         token=HF_TOKEN
+     )
+
+     # Set up the directory structure Chroma expects
+     tenant_dir = os.path.join(CHROMA_DIR, "chroma.sqlite3")
+     os.makedirs(os.path.dirname(tenant_dir), exist_ok=True)
+     shutil.copy(chroma_src, tenant_dir)
+
+ setup_chroma()
+
+ # 4. Explicit ChromaDB connection
+ chroma_client = chromadb.PersistentClient(
+     path=CHROMA_DIR,
+     tenant="default_tenant",
+     database="default_database"
+ )
+
+ # 5. Get or create the collection
+ try:
+     collection = chroma_client.get_collection("legal_docs")
+ except ValueError:
+     collection = chroma_client.create_collection("legal_docs")

- # 5. Configure embeddings (optimized for Spaces)
+ # 6. Configure embeddings
  embeddings = HuggingFaceEmbeddings(
      model_name="sentence-transformers/all-mpnet-base-v2",
      model_kwargs={"device": "cpu"}
  )

- # 6. Create the vector store
+ # 7. Initialize the LangChain Chroma wrapper
  vector_store = Chroma(
      client=chroma_client,
      collection_name="legal_docs",
      embedding_function=embeddings
  )

- # 7. Configure Mistral
+ # 8. Configure Mistral
  llm = ChatMistralAI(
      api_key=MISTRAL_API_KEY,
      model="mistral-large-latest",
      temperature=0.1
  )

- # 8. Final RAG chain
+ # 9. Final RAG chain
  rag_chain = RetrievalQA.from_chain_type(
      llm=llm,
-     retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
+     retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
      chain_type="stuff",
      return_source_documents=True
  )
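A minimal usage sketch for the updated chain (not part of this commit): assuming the Space imports this file as model.py, that the context lines elided from the hunk (the chromadb, hf_hub_download and ChatMistralAI imports) are unchanged, and that HF_TOKEN and MISTRAL_API_KEY are set, the RetrievalQA chain could be exercised like this; the question string is only an example.

# Hypothetical smoke test, not included in the commit
from model import rag_chain

question = "What does Ecuadorian law say about consumer rights?"  # example query
output = rag_chain.invoke({"query": question})

print(output["result"])                  # generated answer
for doc in output["source_documents"]:   # the k=3 retrieved chunks
    print(doc.metadata)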
requirements.txt CHANGED
@@ -3,4 +3,5 @@ langchain
  chromadb
  huggingface_hub
  langchain_mistralai
- langchain-community
+ langchain-community
+ sentence-transformers
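The added sentence-transformers entry matters because HuggingFaceEmbeddings in model.py loads sentence-transformers/all-mpnet-base-v2 through that package, and instantiating the embeddings fails with an ImportError if it is missing. A quick dependency check (a sketch, not part of the commit) could look like:

# Hypothetical sanity check that the embedding dependency resolves
from langchain.embeddings import HuggingFaceEmbeddings

emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"}
)
print(len(emb.embed_query("test")))  # all-mpnet-base-v2 returns 768-dimensional vectors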