Spaces:
Runtime error
Runtime error
quantization, reduce chunksize
Browse files
- client.py +14 -1
- preprocessing.py +4 -5
- scraper.py +1 -2
client.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
-
from qdrant_client import QdrantClient
|
| 5 |
|
| 6 |
load_dotenv()
|
| 7 |
|
|
@@ -24,6 +24,13 @@ class HybridClient:
|
|
| 24 |
collection_name=collection,
|
| 25 |
vectors_config=self.qdrant_client.get_fastembed_vector_params(),
|
| 26 |
sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
)
|
| 28 |
print(f"--- {collection} collection created")
|
| 29 |
return collection
|
|
@@ -33,6 +40,8 @@ class HybridClient:
|
|
| 33 |
documents = []
|
| 34 |
for chunk in chunks:
|
| 35 |
documents.append(chunk.pop("text"))
|
|
|
|
|
|
|
| 36 |
|
| 37 |
self.qdrant_client.add(
|
| 38 |
collection_name=collection,
|
|
@@ -52,3 +61,7 @@ class HybridClient:
|
|
| 52 |
# Select and return metadata
|
| 53 |
# metadata = [hit.metadata for hit in search_result]
|
| 54 |
return search_result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
+
from qdrant_client import QdrantClient, models
|
| 5 |
|
| 6 |
load_dotenv()
|
| 7 |
|
|
|
|
| 24 |
collection_name=collection,
|
| 25 |
vectors_config=self.qdrant_client.get_fastembed_vector_params(),
|
| 26 |
sparse_vectors_config=self.qdrant_client.get_fastembed_sparse_vector_params(),
|
| 27 |
+
quantization_config=models.ScalarQuantization(
|
| 28 |
+
scalar=models.ScalarQuantizationConfig(
|
| 29 |
+
type=models.ScalarType.INT8,
|
| 30 |
+
quantile=0.99,
|
| 31 |
+
always_ram=False,
|
| 32 |
+
),
|
| 33 |
+
),
|
| 34 |
)
|
| 35 |
print(f"--- {collection} collection created")
|
| 36 |
return collection
|
|
|
|
| 40 |
documents = []
|
| 41 |
for chunk in chunks:
|
| 42 |
documents.append(chunk.pop("text"))
|
| 43 |
+
chunk.pop("color")
|
| 44 |
+
chunk.pop("size")
|
| 45 |
|
| 46 |
self.qdrant_client.add(
|
| 47 |
collection_name=collection,
|
|
|
|
| 61 |
# Select and return metadata
|
| 62 |
# metadata = [hit.metadata for hit in search_result]
|
| 63 |
return search_result
|
| 64 |
+
|
| 65 |
+
def get_chapter_name(self, collection: str):
|
| 66 |
+
points = self.qdrant_client.retrieve(collection_name=collection, ids=[0])
|
| 67 |
+
return points[0]
|
preprocessing.py
CHANGED
|
@@ -10,14 +10,14 @@ def sort_text(chunks):
|
|
| 10 |
right_column = []
|
| 11 |
|
| 12 |
for chunk in chunks:
|
| 13 |
-
if chunk["x"] < x_threshold:
|
| 14 |
left_column.append(chunk)
|
| 15 |
else:
|
| 16 |
right_column.append(chunk)
|
| 17 |
|
| 18 |
# Sort the chunks within each column based on the y-coordinate
|
| 19 |
-
left_column = sorted(left_column, key=lambda item: item["y"])
|
| 20 |
-
right_column = sorted(right_column, key=lambda item: item["y"])
|
| 21 |
|
| 22 |
sorted_text = left_column + right_column
|
| 23 |
return sorted_text
|
|
@@ -75,8 +75,7 @@ def get_chunks(doc):
|
|
| 75 |
{
|
| 76 |
"text": clean_text(text.strip()),
|
| 77 |
"page": page_num,
|
| 78 |
-
"x": block["bbox"][0],
|
| 79 |
-
"y": block["bbox"][1],
|
| 80 |
"color": majority_element(spans, "color"),
|
| 81 |
"size": majority_element(spans, "size"),
|
| 82 |
}
|
|
|
|
| 10 |
right_column = []
|
| 11 |
|
| 12 |
for chunk in chunks:
|
| 13 |
+
if chunk["coordinates"][0] < x_threshold:
|
| 14 |
left_column.append(chunk)
|
| 15 |
else:
|
| 16 |
right_column.append(chunk)
|
| 17 |
|
| 18 |
# Sort the chunks within each column based on the y-coordinate
|
| 19 |
+
left_column = sorted(left_column, key=lambda item: item["coordinates"][1])
|
| 20 |
+
right_column = sorted(right_column, key=lambda item: item["coordinates"][1])
|
| 21 |
|
| 22 |
sorted_text = left_column + right_column
|
| 23 |
return sorted_text
|
|
|
|
| 75 |
{
|
| 76 |
"text": clean_text(text.strip()),
|
| 77 |
"page": page_num,
|
| 78 |
+
"coordinates": [round(block["bbox"][0], 1), round(block["bbox"][1], 1)],
|
|
|
|
| 79 |
"color": majority_element(spans, "color"),
|
| 80 |
"size": majority_element(spans, "size"),
|
| 81 |
}
|
scraper.py
CHANGED
|
@@ -70,8 +70,7 @@ async def download(session: aiohttp.ClientSession, url: str, max_retries: int =
|
|
| 70 |
async def upload_book(grade, subject, chapters=None):
|
| 71 |
hclient = HybridClient()
|
| 72 |
|
| 73 |
-
book = await get_book(grade, subject)
|
| 74 |
-
print(type(book))
|
| 75 |
for collection, pdf in book.items():
|
| 76 |
print(collection)
|
| 77 |
chunks = index_pdf(pdf, buffer=True)
|
|
|
|
| 70 |
async def upload_book(grade, subject, chapters=None):
|
| 71 |
hclient = HybridClient()
|
| 72 |
|
| 73 |
+
book = await get_book(grade, subject, chapters)
|
|
|
|
| 74 |
for collection, pdf in book.items():
|
| 75 |
print(collection)
|
| 76 |
chunks = index_pdf(pdf, buffer=True)
|