Marcos Morales
committed on
Commit
·
c69ce57
1
Parent(s):
dd58f3d
modified: app.py
Browse files
app.py
CHANGED
@@ -1,26 +1,27 @@
|
|
1 |
import gradio as gr
|
2 |
-
import
|
|
|
3 |
from pathlib import Path
|
4 |
from src.reader import read_file
|
5 |
from src.chunker import chunk_text
|
6 |
from src.embeddings import embed_texts
|
7 |
|
8 |
def run_pipeline(files, vertical, language):
|
9 |
-
|
10 |
for file_path in files:
|
11 |
meta, body = read_file(Path(file_path))
|
12 |
base_meta = {"vertical": vertical, "language": language, "source": Path(file_path).name, **meta}
|
13 |
chunks = chunk_text(body)
|
14 |
-
|
15 |
-
for i, (chunk, vec) in enumerate(zip(chunks,
|
16 |
-
|
17 |
"id": f"{Path(file_path).stem}-chunk-{i:04d}",
|
18 |
"vector": vec,
|
19 |
"metadata": {**base_meta, "chunk_index": i}
|
20 |
})
|
21 |
out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
|
22 |
with open(out_path, "w", encoding="utf-8") as f:
|
23 |
-
for r in
|
24 |
json.dump(r, f, ensure_ascii=False)
|
25 |
f.write("\n")
|
26 |
return out_path
|
|
|
1 |
import gradio as gr
|
2 |
+
import json
|
3 |
+
import uuid
|
4 |
from pathlib import Path
|
5 |
from src.reader import read_file
|
6 |
from src.chunker import chunk_text
|
7 |
from src.embeddings import embed_texts
|
8 |
|
9 |
def run_pipeline(files, vertical, language):
|
10 |
+
records = []
|
11 |
for file_path in files:
|
12 |
meta, body = read_file(Path(file_path))
|
13 |
base_meta = {"vertical": vertical, "language": language, "source": Path(file_path).name, **meta}
|
14 |
chunks = chunk_text(body)
|
15 |
+
vectors = embed_texts(chunks)
|
16 |
+
for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
|
17 |
+
records.append({
|
18 |
"id": f"{Path(file_path).stem}-chunk-{i:04d}",
|
19 |
"vector": vec,
|
20 |
"metadata": {**base_meta, "chunk_index": i}
|
21 |
})
|
22 |
out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
|
23 |
with open(out_path, "w", encoding="utf-8") as f:
|
24 |
+
for r in records:
|
25 |
json.dump(r, f, ensure_ascii=False)
|
26 |
f.write("\n")
|
27 |
return out_path
|