Update utils.py
Browse files
utils.py
CHANGED
|
@@ -87,6 +87,7 @@ german_stopwords = set(stopwords.words('german'))
|
|
| 87 |
ANZAHL_DOCS = 5
|
| 88 |
# Konstanten für Datei-Upload
|
| 89 |
REPO_ID = "alexkueck/kkg_suche"
|
|
|
|
| 90 |
REPO_TYPE = "space"
|
| 91 |
|
| 92 |
###############################
|
|
@@ -330,15 +331,57 @@ def split_documents_with_id(docs, text_splitter):
|
|
| 330 |
splits.append(split_doc)
|
| 331 |
return splits
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
########################################
|
| 334 |
#finally die Splits erzeugen und laden.....
|
| 335 |
def document_loading_splitting():
|
| 336 |
docs = []
|
| 337 |
print("Directory Loader neu............................")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
# kreiere einen DirectoryLoader für jeden file type
|
| 339 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
| 340 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|
| 341 |
-
|
| 342 |
# Load the files
|
| 343 |
pdf_documents = pdf_loader.load()
|
| 344 |
word_documents = word_loader.load()
|
|
@@ -477,7 +520,7 @@ def upload_file_to_huggingface(file_path, upload_path):
|
|
| 477 |
api.upload_file(
|
| 478 |
path_or_fileobj=file_path,
|
| 479 |
path_in_repo=upload_path,
|
| 480 |
-
repo_id=
|
| 481 |
repo_type=REPO_TYPE,
|
| 482 |
token=HF_WRITE
|
| 483 |
)
|
|
|
|
| 87 |
ANZAHL_DOCS = 5
|
| 88 |
# Konstanten für Datei-Upload
|
| 89 |
REPO_ID = "alexkueck/kkg_suche"
|
| 90 |
+
STORAGE_REPO_ID = "alexkueck/kkg_files"
|
| 91 |
REPO_TYPE = "space"
|
| 92 |
|
| 93 |
###############################
|
|
|
|
| 331 |
splits.append(split_doc)
|
| 332 |
return splits
|
| 333 |
|
| 334 |
+
#######################################
|
| 335 |
+
# Dokumente aus anderem Space laden
|
| 336 |
+
#######################################
|
| 337 |
+
#ein File aus dem Space mit der STORAGE_REPO_ID laden
|
| 338 |
+
def download_file_from_hf(file_name, save_path):
    """Download one file from the storage repo on the Hugging Face Hub.

    Args:
        file_name: Path of the file inside the repo (may contain "/").
        save_path: Local path the downloaded bytes are written to.

    Returns:
        ``save_path``, unchanged, once the file has been written.

    Raises:
        requests.HTTPError: If the Hub answers with an error status code.
        requests.Timeout: If the Hub does not answer in time.
    """
    from urllib.parse import quote

    # Non-model repos are served under a type prefix; without it the Hub
    # resolves the id as a model repo and the request fails for a Space.
    type_prefix = {"space": "spaces/", "dataset": "datasets/"}.get(REPO_TYPE, "")
    # Quote the path so names with blanks, umlauts, '#' or '?' survive;
    # "/" stays unescaped so files in sub-folders keep working.
    url = (
        f"https://huggingface.co/{type_prefix}{STORAGE_REPO_ID}"
        f"/resolve/main/{quote(file_name)}"
    )
    # A timeout keeps a stalled connection from blocking the app forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()  # Raise an error for bad status codes
    with open(save_path, 'wb') as file:
        file.write(response.content)
    return save_path
|
| 345 |
+
|
| 346 |
+
#Liste aller Files in dem Space mit der Repo_id
|
| 347 |
+
def list_files_in_hf_repo(repo_id, repo_type=None):
    """Return the paths of all files stored in a Hugging Face Hub repo.

    Args:
        repo_id: Repo identifier in the form "owner/name".
        repo_type: Hub repo type ("space", "dataset" or "model"). Defaults
            to the module-level ``REPO_TYPE``.

    Returns:
        The file listing as returned by ``api.list_repo_files``.
    """
    # Without an explicit repo_type the Hub client looks the id up as a
    # model repo, which fails for the Space this module works with.
    effective_type = REPO_TYPE if repo_type is None else repo_type
    return api.list_repo_files(repo_id=repo_id, repo_type=effective_type)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
|
| 353 |
########################################
|
| 354 |
#finally die Splits erzeugen und laden.....
|
| 355 |
def document_loading_splitting():
|
| 356 |
docs = []
|
| 357 |
print("Directory Loader neu............................")
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
# Verzeichnis für heruntergeladene Dateien
|
| 362 |
+
download_dir = "downloaded_files"
|
| 363 |
+
os.makedirs(download_dir, exist_ok=True)
|
| 364 |
+
|
| 365 |
+
# Dateien im Hugging Face Space auflisten
|
| 366 |
+
files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID)
|
| 367 |
+
|
| 368 |
+
# Dateien aus dem Hugging Face Space herunterladen
|
| 369 |
+
for file_name in files_in_repo:
|
| 370 |
+
if file_name.endswith('.pdf') or file_name.endswith('.docx'):
|
| 371 |
+
local_file_path = os.path.join(download_dir, os.path.basename(file_name))
|
| 372 |
+
download_file_from_hf(file_name, local_file_path)
|
| 373 |
+
|
| 374 |
+
# Erstellen von DirectoryLoader für jeden Dateityp
|
| 375 |
+
pdf_loader = create_directory_loader('.pdf', download_dir)
|
| 376 |
+
word_loader = create_directory_loader('.word', download_dir)
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
|
| 380 |
+
"""
|
| 381 |
# kreiere einen DirectoryLoader für jeden file type
|
| 382 |
pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
|
| 383 |
word_loader = create_directory_loader('.word', CHROMA_WORD)
|
| 384 |
+
"""
|
| 385 |
# Load the files
|
| 386 |
pdf_documents = pdf_loader.load()
|
| 387 |
word_documents = word_loader.load()
|
|
|
|
| 520 |
api.upload_file(
|
| 521 |
path_or_fileobj=file_path,
|
| 522 |
path_in_repo=upload_path,
|
| 523 |
+
repo_id=STORAGE_REPO_ID,
|
| 524 |
repo_type=REPO_TYPE,
|
| 525 |
token=HF_WRITE
|
| 526 |
)
|