Update utils.py
utils.py CHANGED
@@ -102,16 +102,6 @@ HF_WRITE = os.getenv("HF_WRITE")
 # Create an HfApi instance
 api = HfApi()
 
-# Check whether the repository exists and is accessible
-try:
-    repo_info = api.list_repo_files(repo_id=STORAGE_REPO_ID, repo_type=REPO_TYPE, token=hf_token)
-    print(f"Repository '{STORAGE_REPO_ID}' enthält folgende Dateien: {repo_info}")
-except Exception as e:
-    print(f"Fehler beim Zugriff auf das Repository: {e}")
-
-
-
-
 
 # Mapping for the splits (original and preprocessed)
 split_to_original_mapping = []
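Note: the removed accessibility probe can still be handy when debugging token or repo-name problems. A minimal standalone sketch of the same check, assuming STORAGE_REPO_ID, REPO_TYPE, and hf_token are defined as earlier in utils.py (check_repo_access is a hypothetical helper name, not from the file):

    from huggingface_hub import HfApi

    def check_repo_access(repo_id, repo_type, token):
        # Returns the file list on success, None if the repo is missing or the token lacks access
        try:
            return HfApi().list_repo_files(repo_id=repo_id, repo_type=repo_type, token=token)
        except Exception as e:
            print(f"Repository access failed: {e}")
            return None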
@@ -260,10 +250,7 @@ def clean_text(text):
 ##################################################
 # RAG helper functions - prepare documents for the vector store
 ##################################################
-
-# Configure a directory loader to extract text
-##################################################
-
+# Download files from another repo
 def access_pdf(self, filename):
     # Create a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=self.file_type) as temp_file:
@@ -280,35 +267,8 @@ def access_pdf(self, filename):
 
     return temp_path
 
-
-# Define a better directory loader as a CustomLoader that extracts the document content, the page numbers, the headings, and the paths to the documents
-def create_custom_loader(file_type, file_list): #create_directory_loader(file_type, directory_path):
-    loaders = {
-        '.pdf': load_pdf_with_metadata,
-        '.word': load_word_with_metadata,
-    }
-
-
-    class CustomLoader:
-
-        def __init__(self, directory_path, file_type, loader_func):
-            self.directory_path = directory_path
-            self.file_type = file_type
-            self.loader_func = loader_func
-
-        def load(self):
-            documents = []
-            for root, _, files in os.walk(self.directory_path):
-                for file in files:
-                    if file.endswith(self.file_type):
-                        file_path = os.path.join(root, file)
-                        documents.extend(self.loader_func(file_path))
-            return documents
-
-    return CustomLoader(directory_path, file_type, loaders[file_type])
-"""
-
-
+################################################
+# Define a better directory loader as a CustomLoader that extracts the document content, the page numbers, the headings, and the paths to the documents
 def create_custom_loader(file_type, file_list):
     loaders = {
         '.pdf': load_pdf_with_metadata,
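Note: only the first lines of the new list-based create_custom_loader are visible in this hunk. A plausible sketch of the rest, under the assumption that it mirrors the deleted class but iterates the explicit file list instead of walking a directory:

    def create_custom_loader(file_type, file_list):
        loaders = {
            '.pdf': load_pdf_with_metadata,
            '.docx': load_word_with_metadata,
        }

        class CustomLoader:
            def __init__(self, file_type, file_list, loader_func):
                self.file_type = file_type
                self.file_list = file_list
                self.loader_func = loader_func

            def load(self):
                # Load every listed file with the matching metadata-aware loader
                documents = []
                for file_path in self.file_list:
                    documents.extend(self.loader_func(file_path))
                return documents

        return CustomLoader(file_type, file_list, loaders[file_type])

A shape like this would also fix a bug in the deleted version, whose final return referenced directory_path, a name that was never a parameter of create_custom_loader(file_type, file_list).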
@@ -330,7 +290,7 @@ def load_pdf_with_metadata(file_path):
         documents.append(Document(content=content, title=title, page=page_number, path=file_path, split_id=None))
     return documents
 
-# for
+# For Word documents
 def load_word_with_metadata(file_path):
     document = docx.Document(file_path)
     title = "Dokument"
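Note: load_word_with_metadata is only partially visible here. For reference, the core python-docx calls it presumably builds on (extract_word_paragraphs is an illustrative helper, not from the file):

    import docx

    def extract_word_paragraphs(file_path):
        # Collect the non-empty paragraph texts of a .docx file
        document = docx.Document(file_path)
        return [p.text for p in document.paragraphs if p.text.strip()]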
@@ -345,22 +305,11 @@ def load_word_with_metadata(file_path):
 
 
 ################################################
-# Vector store
+# For the vector store
 ################################################
-
-"""
-def list_files_in_hf_repo(repo_id, directory=""):
-    try:
-        repo_info = api.list_repo_files(repo_id=repo_id, repo_type=REPO_TYPE)
-        if directory:
-            repo_info = [file for file in repo_info if file.startswith(directory)]
-        return repo_info
-    except Exception as e:
-        print(f"Fehler beim Zugriff auf das Repository.........................:{repo_id} {e}")
-        return []
-"""
+
 ################################################
-# Document splitting
+# Document splitting - and id for the mapping
 ################################################
 # Split the contents so they can be loaded into the vector database as splits
 # Function for splitting and assigning the doc_id
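Note: split_documents_with_id itself is outside this diff. A sketch of how splitting with a stable doc_id typically looks with a LangChain-style splitter (function name and chunk sizes are illustrative):

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    def split_texts_with_id(texts, chunk_size=1000, chunk_overlap=100):
        # Split each text and tag every chunk with the index of its source document
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        splits = []
        for doc_id, text in enumerate(texts):
            for chunk in splitter.split_text(text):
                splits.append({"doc_id": doc_id, "text": chunk})
        return splits

The doc_id is what later lets split_to_original_mapping tie a preprocessed split back to its original split.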
@@ -382,34 +331,11 @@ def split_documents_with_id(docs, text_splitter):
 
 
 ########################################
-# finally create and load the splits.....
+# finally create and load the splits..... for the vector store
 def document_loading_splitting():
     docs = []
     print("Directory Loader neu............................")
 
-
-
-    # Directory for downloaded files
-    #download_dir = CHROMA_PDF
-    #os.makedirs(download_dir, exist_ok=True)
-
-    # List the files in the Hugging Face Space
-    """
-    files_in_repo = list_files_in_hf_repo(STORAGE_REPO_ID, "chroma/kkg/pdf/")
-    print("hier.....................................")
-    # Download the files from the Hugging Face Space with the STORAGE_REPO_ID
-    for file_name in files_in_repo:
-        if file_name.endswith('.pdf'):
-            local_file_path = os.path.join(CHROMA_PDF, os.path.basename(file_name))
-            download_file_from_hf(file_name, local_file_path)
-        if file_name.endswith('.docx'):
-            local_file_path = os.path.join(CHROMA_WORD, os.path.basename(file_name))
-            download_file_from_hf(file_name, local_file_path)
-            print("file_name..................."+str(file_name))
-            print("local_file_path..................."+str(local_file_path))
-    """
-
-
     # List the files in the Hugging Face Space
     files_in_repo = list_repo_files(repo_id=STORAGE_REPO_ID, repo_type="space", token=hf_token)
     pdf_files = [f for f in files_in_repo if f.endswith('.pdf') and f.startswith("chroma/kkg/pdf/")]
@@ -417,19 +343,9 @@ def document_loading_splitting():
 
 
     # Create a DirectoryLoader for each file type
-    # pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
-    #word_loader = create_directory_loader('.word', CHROMA_WORD)
-
     pdf_loader = create_custom_loader('.pdf', pdf_files)
     word_loader = create_custom_loader('.docx', word_files)
 
-
-
-    """
-    # create a DirectoryLoader for each file type
-    pdf_loader = create_directory_loader('.pdf', CHROMA_PDF)
-    word_loader = create_directory_loader('.word', CHROMA_WORD)
-    """
     # Load the files
     pdf_documents = pdf_loader.load()
     word_documents = word_loader.load()
@@ -489,6 +405,7 @@ def document_storage_chroma(splits):
 
 ########################################################
 # Save or load the splits for the vector store
+########################################################
 def save_splits(preprocessed_splits, original_splits, directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
     # Create the directory if it does not exist
     if not os.path.exists(directory):
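Note: the middle of save_splits between this hunk and the next is elided; given the upload calls below, it presumably pickles both lists to disk first, along these lines (dump_pickle is an illustrative name):

    import pickle

    def dump_pickle(obj, filepath):
        # Serialize one splits list to disk before uploading it to the Space
        with open(filepath, "wb") as f:
            pickle.dump(obj, f)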
@@ -507,23 +424,7 @@ def save_splits(preprocessed_splits, original_splits, directory="chroma/kkg", pr
     # Upload the split files to the Hugging Face Space
     upload_file_to_huggingface(preprocessed_filepath, f"{directory}/{preprocessed_filename}")
     upload_file_to_huggingface(original_filepath, f"{directory}/{original_filename}")
-
-def load_splits(directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
-    # Build the full paths to the files
-    preprocessed_filepath = os.path.join(directory, preprocessed_filename)
-    original_filepath = os.path.join(directory, original_filename)
-
-    # Load the preprocessed splits from the file
-    if os.path.exists(preprocessed_filepath) and os.path.exists(original_filepath):
-        with open(preprocessed_filepath, "rb") as f:
-            preprocessed_splits = pickle.load(f)
-
-        with open(original_filepath, "rb") as f:
-            original_splits = pickle.load(f)
-
-        return preprocessed_splits, original_splits
-    return None, None
-"""
+
 def load_splits(directory="chroma/kkg", preprocessed_filename="preprocessed_splits.pkl", original_filename="original_splits.pkl"):
     preprocessed_splits = None
     original_splits = None
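Note: the new load_splits body is largely elided. If it fetches the pickles from the Space rather than relying on local copies, a hedged sketch with huggingface_hub could look like this (fetch_pickle_from_space is an illustrative helper, not the file's actual code):

    import pickle
    from huggingface_hub import hf_hub_download

    def fetch_pickle_from_space(repo_id, path_in_repo, token):
        # Download a pickled file from the Space and deserialize it; None on any failure
        try:
            local_path = hf_hub_download(repo_id=repo_id, filename=path_in_repo,
                                         repo_type="space", token=token)
            with open(local_path, "rb") as f:
                return pickle.load(f)
        except Exception as e:
            print(f"Could not load {path_in_repo}: {e}")
            return None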
@@ -554,20 +455,11 @@ def load_splits(directory="chroma/kkg", preprocessed_filename="preprocessed_spli
 
     return preprocessed_splits, original_splits
 
-########################################################
-# Save or load the vector store
-# Load the vector store - from the saved splits
-"""
-def load_vectorstore():
-    splits_and_metadata = load_splits_and_metadata()
-    if splits_and_metadata is not None:
-        PREPROCESSED_SPLITS, SPLIT_TO_ORIGINAL_MAPPING = splits_and_metadata
-        return document_storage_chroma(PREPROCESSED_SPLITS)
-    return None
-"""
 
-
-
+
+########################################
+# Save - and load - the mapping of the original splits and the preprocessed splits
+########################################
 def save_split_to_original_mapping(mapping, directory="chroma/kkg", filename="mapping.pkl"):
     # Create the directory if it does not exist
     if not os.path.exists(directory):
@@ -580,17 +472,7 @@ def save_split_to_original_mapping(mapping, directory="chroma/kkg", filename="ma
 
     # Upload the mapping file to the Hugging Face Space
    upload_file_to_huggingface(filepath, f"{directory}/{filename}")
-
-def load_split_to_original_mapping(directory="chroma/kkg", filename="mapping.pkl"):
-    # Build the full path to the file
-    filepath = os.path.join(directory, filename)
-
-    # Load the mapping from the file
-    if os.path.exists(filepath):
-        with open(filepath, "rb") as f:
-            return pickle.load(f)
-    return None
-"""
+
 
 def load_split_to_original_mapping(directory="chroma/kkg", filename="mapping.pkl"):
     try:
@@ -624,7 +506,7 @@ def upload_file_to_huggingface(file_path, upload_path):
     )
 
 
-# Load a file from the Space with the REPO_ID
+# Load a file from the Space with the REPO_ID - authentication via the HEADER
 def download_file_from_hf(file_name, save_path):
     url = f"https://huggingface.co/{STORAGE_REPO_ID}/resolve/main/{file_name}"
     response = requests.get(url, headers=HEADERS)
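Note: the rest of download_file_from_hf is not shown; it presumably just persists the response, roughly like this (helper name is illustrative):

    import requests

    def save_response_to_file(response, save_path):
        # raise_for_status surfaces 401/404 before anything is written to disk
        response.raise_for_status()
        with open(save_path, "wb") as f:
            f.write(response.content)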
@@ -776,7 +658,7 @@ def rag_chain_simpel( prompt, retriever):
     # no relevant documents found
     result = {
         "answer": "Keine relevanten Dokumente gefunden",
-        "relevant_docs":
+        "relevant_docs": None
     }
 
     return result
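Note: the old line left the "relevant_docs" key without a value, which is a SyntaxError; with None as an explicit sentinel, callers can distinguish the no-hits case cleanly, e.g.:

    result = rag_chain_simpel(prompt, retriever)
    if result["relevant_docs"] is None:
        print(result["answer"])  # nothing to cite, show the fallback answer only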
@@ -793,16 +675,7 @@ def extract_document_info(documents):
         title = filename if filename else "Keine Überschrift"
         doc_path = doc.metadata.get("path", "")
         # Determine the document type and adjust the path accordingly
-        d_link = download_link(doc)
-        """
-        doc_path = doc.metadata.get("path", "")
-        if doc_path.endswith('.pdf'):
-            download_link = download_link(doc) #f"https://huggingface.co/spaces/alexkueck/{STORAGE_REPO_ID}/resolve/main/chroma/kkg/pdf/{title}"
-        elif doc_path.endswith('.docx'):
-            download_link = f"https://huggingface.co/spaces/alexkueck/{STORAGE_REPO_ID}/resolve/main/chroma/kkg/word/{title}"
-        else:
-            download_link = doc_path
-        """
+        d_link = download_link(doc)
 
         info = {
             'content': doc.page_content,
@@ -838,16 +711,6 @@ def generate_prompt_with_history(text, history, max_length=4048):
     return None
 
 
-#############################################
-# Prompt and history for the Hugging Face interface
-def generate_prompt_with_history_hf(prompt, history):
-    history_transformer_format = history + [[prompt, ""]]
-    #stop = StopOnTokens()
-
-    messages = "".join(["".join(["\n<human>:"+item[0], "\n<bot>:"+item[1]]) #curr_system_message +
-                        for item in history_transformer_format])
-
-
 
 ##########################################
 # Hashing.... for validation........
@@ -869,20 +732,7 @@ def transfer_input(inputs):
 
 
 ########################################################
-######## Helper functions file-
-"""
-def download_link(doc):
-    # URL for downloading the file
-    # Check if doc is a dictionary and contains the key 'pfad'
-    if isinstance(doc, dict) and 'pfad' in doc:
-        file_url = f"https://huggingface.co/spaces/alexkueck/{STORAGE_REPO_ID}/resolve/main/chroma/kkg/{doc['pfad']}?token=hf_token"
-        return f'<b><a href="{file_url}" target="_blank" style="color: #BB70FC; font-weight: bold;">{doc["titel"]}</a></b>'
-    else:
-        file_url = f"https://huggingface.co/spaces/alexkueck/{STORAGE_REPO_ID}/resolve/main/{doc}?token=hf_token"
-        return f'<b><a href="{file_url}" target="_blank" style="color: #BB70FC; font-weight: bold;">{doc}</a></b>'
-
-
-
+######## Helper functions file download ##################
 def download_link(doc):
     # Base URL for the Hugging Face repository
     base_url = f"https://huggingface.co/spaces/{STORAGE_REPO_ID}/resolve/main"
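Note: the remainder of the new download_link is elided. Given base_url, a plausible continuation builds the styled anchor (the same markup as the deleted variant) from the document path; make_download_anchor and its parameters are illustrative:

    from urllib.parse import quote

    def make_download_anchor(base_url, rel_path, label):
        # quote() keeps spaces and umlauts in file names URL-safe
        file_url = f"{base_url}/{quote(rel_path)}"
        return f'<b><a href="{file_url}" target="_blank" style="color: #BB70FC; font-weight: bold;">{label}</a></b>'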
@@ -917,32 +767,6 @@ def download_link(doc):
 #################################################
 # Display the file list nicely in the file-upload tab
 #################################################
-"""
-def display_files():
-    files = os.listdir(DOCS_DIR_PDF)
-    files_table = "<table style='width:100%; border-collapse: collapse;'>"
-    files_table += "<tr style='background-color: #930BBA; color: white; font-weight: bold; font-size: larger;'><th>Dateiname - PDF-Ordner</th></tr>"
-    for i, file in enumerate(files):
-        file_path = os.path.join(DOCS_DIR_PDF, file)
-        file_size = os.path.getsize(file_path) / 1024  # size in KB
-        row_color = "#4f4f4f" if i % 2 == 0 else "#3a3a3a"  # alternating row colors
-        files_table += f"<tr style='background-color: {row_color}; border-bottom: 1px solid #ddd;'>"
-        files_table += f"<td><b>{download_link(file)}</b></td></tr>"
-    files_table += "</table>"
-
-    files = os.listdir(DOCS_DIR_WORD)
-    files_table += "<table style='width:100%; border-collapse: collapse;'>"
-    files_table += "<tr style='background-color: #930BBA; color: white; font-weight: bold; font-size: larger;'><th>Dateiname - Word-Ordner</th></tr>"
-    for i, file in enumerate(files):
-        file_path = os.path.join(DOCS_DIR_WORD, file)
-        file_size = os.path.getsize(file_path) / 1024  # size in KB
-        row_color = "#4f4f4f" if i % 2 == 0 else "#3a3a3a"  # alternating row colors
-        files_table += f"<tr style='background-color: {row_color}; border-bottom: 1px solid #ddd;'>"
-        files_table += f"<td><b>{download_link(file)}</b></td></tr>"
-    files_table += "</table>"
-    return files_table
-"""
-
 def display_files():
     files_table = "<table style='width:100%; border-collapse: collapse;'>"
 
@@ -966,13 +790,6 @@ def display_files():
     return files_table
 
 
-# List the relevant documents that were found (links)
-"""
-def list_pdfs():
-    if not os.path.exists(DOCS_DIR):
-        return []
-    return [f for f in os.listdir(SAVE_DIR) if f.endswith('.pdf')]
-"""
 ##########################################
 # Determine the extension of the uploaded file
 def analyze_file(file):
@@ -1012,8 +829,9 @@ class State:
         self.interrupted = False
 shared_state = State()
 
-
+###############################################
 # For the relevant documents - so that they have the attributes matching the dictionary
+###############################################
 class Document:
     def __init__(self, content, title, page, path, split_id=None):
         self.page_content = content
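Note: a quick usage sketch of the Document wrapper, with illustrative values:

    doc = Document(content="Der eigentliche Textinhalt...", title="Beispiel.pdf", page=3,
                   path="chroma/kkg/pdf/Beispiel.pdf", split_id=None)
    print(doc.page_content)  # the constructor stores content as page_content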