Marcos Morales committed on
Commit
c69ce57
·
1 Parent(s): dd58f3d

modified: app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -6
app.py CHANGED
@@ -1,26 +1,27 @@
1
  import gradio as gr
2
- import yaml, json, uuid, os
 
3
  from pathlib import Path
4
  from src.reader import read_file
5
  from src.chunker import chunk_text
6
  from src.embeddings import embed_texts
7
 
def run_pipeline(files, vertical, language):
    """Embed every chunk of the given files and dump the records as JSONL.

    For each path in ``files`` the file is read with ``read_file`` (which
    yields a ``(meta, body)`` pair), the body is split with ``chunk_text``
    and the chunks are embedded with ``embed_texts``. One record per
    (chunk, vector) pair is collected and, once every file has been
    processed, all records are written one JSON object per line.

    Args:
        files: Iterable of file paths to process. ``None`` is treated as
            an empty selection (presumably what the UI sends when nothing
            was uploaded -- confirm against the caller).
        vertical: Value stored under the ``"vertical"`` metadata key.
        language: Value stored under the ``"language"`` metadata key.

    Returns:
        Path (str) of the ``/tmp/<uuid hex>.jsonl`` file that was written.
        The file is empty when there was nothing to process.
    """
    if files is None:
        files = []

    recs = []
    for file_path in files:
        src = Path(file_path)
        meta, body = read_file(src)
        # ``meta`` is unpacked last, so keys it shares with the defaults win.
        base_meta = {"vertical": vertical, "language": language, "source": src.name, **meta}
        chunks = chunk_text(body)
        vecs = embed_texts(chunks)
        # Chunk text itself is not stored; zip only pairs each vector with a
        # chunk position and stops at the shorter of the two sequences.
        for i, (_chunk, vec) in enumerate(zip(chunks, vecs), 1):
            recs.append({
                "id": f"{src.stem}-chunk-{i:04d}",
                "vector": vec,
                "metadata": {**base_meta, "chunk_index": i},
            })

    # Written only after all files succeeded, so a failure above leaves no
    # partial output behind.
    out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
    with open(out_path, "w", encoding="utf-8") as f:
        for r in recs:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")
    return out_path
 
1
  import gradio as gr
2
+ import json
3
+ import uuid
4
  from pathlib import Path
5
  from src.reader import read_file
6
  from src.chunker import chunk_text
7
  from src.embeddings import embed_texts
8
 
def run_pipeline(files, vertical, language):
    """Chunk and embed each input file, then write all records to a JSONL file.

    Every path in ``files`` is read with ``read_file`` (returning a
    ``(meta, body)`` pair), split with ``chunk_text`` and embedded with
    ``embed_texts``. Each (chunk, vector) pair becomes one record holding
    an ``"id"`` of the form ``<file stem>-chunk-<0001...>``, the
    ``"vector"`` and a ``"metadata"`` dict with a 1-based
    ``"chunk_index"``.

    Args:
        files: Iterable of file paths. ``None`` is accepted and handled
            like an empty iterable (presumably what the UI passes when no
            file is selected -- confirm against the caller).
        vertical: Stored under the ``"vertical"`` metadata key.
        language: Stored under the ``"language"`` metadata key.

    Returns:
        The path (str) of the newly written ``/tmp/<uuid hex>.jsonl`` file,
        one JSON object per line. The file is empty if no records were
        produced.
    """
    if files is None:
        files = []

    records = []
    for file_path in files:
        source = Path(file_path)
        meta, body = read_file(source)
        # File-provided metadata is unpacked last and therefore overrides
        # the vertical/language/source defaults on key collisions.
        base_meta = {
            "vertical": vertical,
            "language": language,
            "source": source.name,
            **meta,
        }
        chunks = chunk_text(body)
        vectors = embed_texts(chunks)
        # The chunk text is not part of the record; zip is kept so that the
        # number of records never exceeds the number of chunks.
        for i, (_chunk, vec) in enumerate(zip(chunks, vectors), 1):
            records.append({
                "id": f"{source.stem}-chunk-{i:04d}",
                # Array-like vectors (anything exposing ``tolist``) are not
                # JSON serializable as-is; plain lists pass through unchanged.
                "vector": vec.tolist() if hasattr(vec, "tolist") else vec,
                "metadata": {**base_meta, "chunk_index": i},
            })

    # The output is written only after every file was processed, so an
    # error while reading or embedding does not leave a partial file.
    out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
    with open(out_path, "w", encoding="utf-8") as f:
        for r in records:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")
    return out_path