Spaces:
Running
on
T4
Running
on
T4
Update auditqa/doc_process.py
Browse files
- auditqa/doc_process.py +23 -0
auditqa/doc_process.py
CHANGED
|
@@ -50,4 +50,27 @@ def process_pdf():
|
|
| 50 |
doc.metadata["year"] = file[-4:]
|
| 51 |
|
| 52 |
all_documents[category].append(doc_processed)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
|
|
|
| 50 |
doc.metadata["year"] = file[-4:]
|
| 51 |
|
| 52 |
all_documents[category].append(doc_processed)
|
| 53 |
+
|
| 54 |
+
for key, docs_processed in all_documents.items():
|
| 55 |
+
docs_processed = [item for sublist in docs_processed for item in sublist]
|
| 56 |
+
all_documents[key] = docs_processed
|
| 57 |
+
|
| 58 |
+
embeddings = HuggingFaceEmbeddings(
|
| 59 |
+
model_kwargs = {'device': device},
|
| 60 |
+
encode_kwargs = {'normalize_embeddings': True},
|
| 61 |
+
model_name="BAAI/bge-small-en-v1.5"
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
qdrant_collections = {}
|
| 65 |
+
|
| 66 |
+
for file,value in all_documents.items():
|
| 67 |
+
print("emebddings for:",file)
|
| 68 |
+
qdrant_collections[file] = Qdrant.from_documents(
|
| 69 |
+
value,
|
| 70 |
+
embeddings,
|
| 71 |
+
location=":memory:",
|
| 72 |
+
collection_name=file,
|
| 73 |
+
)
|
| 74 |
+
print("done")
|
| 75 |
+
return qdrant_collections
|
| 76 |
|