codelion committed on
Commit b84dd14 · verified · 1 Parent(s): f65663c

Update loaders/common.py

Files changed (1)
  1. loaders/common.py +90 -76
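In summary, this commit removes the regex-based `clean_chat_text` pre-processing step and instead instruments `process_file` with stage-by-stage debug logging, per-chunk validation (chunks containing apparent error messages or non-string content are skipped with a warning), and nested try/except blocks so failures during loading, splitting, or vector-store insertion are reported with full context before being re-raised.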
loaders/common.py CHANGED
@@ -6,86 +6,100 @@ from langchain.schema import Document
 import streamlit as st
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from stats import add_usage
-import re
-
-def clean_chat_text(text):
-    """Clean chat export text to remove special characters and format consistently"""
-    # Remove non-printable characters
-    text = ''.join(char for char in text if char.isprintable())
-
-    # Clean up WhatsApp-style timestamps and phone numbers
-    text = re.sub(r'\[\d{1,2}/\d{1,2}/\d{2,4},\s+\d{1,2}:\d{1,2}:\d{1,2}\s+[AP]M\]', '', text)
-    text = re.sub(r'‪\+\d{2,3}\s*\d{3,10}\s*\d{3,10}‬', '', text)
-
-    # Remove joining messages
-    text = re.sub(r'joined using this group\'s invite link', '', text)
-
-    # Remove extra whitespace
-    text = ' '.join(text.split())
-
-    return text
 
 def process_file(vector_store, file, loader_class, file_suffix, stats_db=None):
-    documents = []
-    file_name = file.name
-    file_size = file.size
-    if st.secrets.self_hosted == "false":
-        if file_size > 1000000:
-            st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
-            return
-
-    dateshort = time.strftime("%Y%m%d")
-    with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
-        tmp_file.write(file.getvalue())
-        tmp_file.flush()
-        loader = loader_class(tmp_file.name)
-        documents = loader.load()
-        file_sha1 = compute_sha1_from_file(tmp_file.name)
-    os.remove(tmp_file.name)
-
-    chunk_size = st.session_state['chunk_size']
-    chunk_overlap = st.session_state['chunk_overlap']
-    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-
-    documents = text_splitter.split_documents(documents)
-
-    # Clean the text content before creating metadata
-    docs_with_metadata = [Document(page_content=clean_chat_text(doc.page_content),
-                                   metadata={"file_sha1": file_sha1,
-                                             "file_size": file_size,
-                                             "file_name": file_name,
-                                             "chunk_size": chunk_size,
-                                             "chunk_overlap": chunk_overlap,
-                                             "date": dateshort,
-                                             "user": st.session_state["username"]})
-                          for doc in documents]
-
     try:
-        # Add debug logging before vector store addition
-        print(f"Attempting to add {len(docs_with_metadata)} documents")
-        print(f"Sample cleaned content: {docs_with_metadata[0].page_content[:200] if docs_with_metadata else 'No documents'}")
-
-        vector_store.add_documents(docs_with_metadata)
-
-        if stats_db:
-            add_usage(stats_db, "embedding", "file", metadata={"file_name": file_name,
-                                                               "file_type": file_suffix,
-                                                               "chunk_size": chunk_size,
-                                                               "chunk_overlap": chunk_overlap})
-    except Exception as e:
-        print(f"Error adding documents to vector store:")
-        print(f"Exception: {str(e)}")
-        print(f"Input details:")
-        print(f"File name: {file_name}")
-        print(f"File size: {file_size}")
-        print(f"File SHA1: {file_sha1}")
-        print(f"Number of documents: {len(docs_with_metadata)}")
-        print(f"Chunk size: {chunk_size}")
-        print(f"Chunk overlap: {chunk_overlap}")
-        print(f"First document preview (truncated):")
-        if docs_with_metadata:
-            print(docs_with_metadata[0].page_content[:500])
-
-        # Additional debug info for vector store
-        print(f"Vector store type: {type(vector_store).__name__}")
+        print("=== Starting file processing ===")
+        print(f"Initial file details - Name: {file.name}, Size: {file.size}")
+
+        documents = []
+        file_name = file.name
+        file_size = file.size
+        if st.secrets.self_hosted == "false":
+            if file_size > 1000000:
+                st.error("File size is too large. Please upload a file smaller than 1MB or self host.")
+                return
+
+        dateshort = time.strftime("%Y%m%d")
+
+        # Debug loading
+        print("=== Document Loading ===")
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_suffix) as tmp_file:
+            tmp_file.write(file.getvalue())
+            tmp_file.flush()
+            print(f"Temporary file created: {tmp_file.name}")
+
+            loader = loader_class(tmp_file.name)
+            documents = loader.load()
+            print(f"Number of documents after loading: {len(documents)}")
+            print("First document content preview:")
+            if documents:
+                print(documents[0].page_content[:200])
+
+            file_sha1 = compute_sha1_from_file(tmp_file.name)
+        os.remove(tmp_file.name)
+
+        # Debug splitting
+        print("\n=== Document Splitting ===")
+        chunk_size = st.session_state['chunk_size']
+        chunk_overlap = st.session_state['chunk_overlap']
+        print(f"Splitting with chunk_size: {chunk_size}, overlap: {chunk_overlap}")
+
+        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+        documents = text_splitter.split_documents(documents)
+        print(f"Number of documents after splitting: {len(documents)}")
+
+        # Debug metadata creation
+        print("\n=== Creating Documents with Metadata ===")
+        docs_with_metadata = []
+        for i, doc in enumerate(documents):
+            if isinstance(doc.page_content, str):
+                if "error" in doc.page_content.lower():
+                    print(f"WARNING: Found potential error message in document {i}:")
+                    print(doc.page_content[:200])
+                    continue  # Skip this document
+
+                new_doc = Document(
+                    page_content=doc.page_content,
+                    metadata={
+                        "file_sha1": file_sha1,
+                        "file_size": file_size,
+                        "file_name": file_name,
+                        "chunk_size": chunk_size,
+                        "chunk_overlap": chunk_overlap,
+                        "date": dateshort,
+                        "user": st.session_state["username"]
+                    }
+                )
+                docs_with_metadata.append(new_doc)
+            else:
+                print(f"WARNING: Document {i} has non-string content type: {type(doc.page_content)}")
+                print(f"Content: {str(doc.page_content)[:200]}")
+
+        print(f"Final number of documents to be added: {len(docs_with_metadata)}")
+
+        # Vector store addition
+        try:
+            vector_store.add_documents(docs_with_metadata)
+            if stats_db:
+                add_usage(stats_db, "embedding", "file", metadata={
+                    "file_name": file_name,
+                    "file_type": file_suffix,
+                    "chunk_size": chunk_size,
+                    "chunk_overlap": chunk_overlap
+                })
+        except Exception as e:
+            print(f"\n=== Vector Store Addition Error ===")
+            print(f"Exception: {str(e)}")
+            print(f"Input details:")
+            print(f"File name: {file_name}")
+            print(f"File size: {file_size}")
+            print(f"File SHA1: {file_sha1}")
+            print(f"Number of documents: {len(docs_with_metadata)}")
+            print(f"Vector store type: {type(vector_store).__name__}")
+            raise
+
+    except Exception as e:
+        print(f"\n=== General Processing Error ===")
+        print(f"Exception occurred during file processing: {str(e)}")
         raise
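
For context, below is a minimal sketch of how `process_file` might be driven from a Streamlit app. Everything in it beyond the `process_file` signature shown in the diff is an assumption for local testing: the FAISS-plus-FakeEmbeddings store, the session-state defaults, and the `TextLoader` choice are illustrative stand-ins, not part of this commit.

# Hypothetical harness for exercising loaders.common.process_file locally.
# The FAISS + FakeEmbeddings store, the session-state defaults, and the
# TextLoader choice are assumptions for illustration, not part of the commit.
import streamlit as st
from langchain.document_loaders import TextLoader
from langchain.embeddings.fake import FakeEmbeddings
from langchain.vectorstores import FAISS

from loaders.common import process_file

# process_file reads chunking parameters and the username from session state.
for key, value in {"chunk_size": 500, "chunk_overlap": 0, "username": "local-user"}.items():
    if key not in st.session_state:
        st.session_state[key] = value

# Any vector store exposing add_documents() works; an in-memory FAISS index
# with fake random embeddings avoids calling a real embedding API.
vector_store = FAISS.from_texts(["placeholder"], FakeEmbeddings(size=1536))

# process_file also reads st.secrets.self_hosted, so .streamlit/secrets.toml
# must define self_hosted (e.g. self_hosted = "true" to skip the 1MB cap).
uploaded = st.file_uploader("Upload a text file", type=["txt"])
if uploaded is not None:
    process_file(vector_store, uploaded, TextLoader, ".txt")

Because both except blocks re-raise after logging, a failed ingestion still surfaces as an exception in the Streamlit app, while the printed "===" sections pinpoint which stage (loading, splitting, metadata creation, or vector-store insertion) failed.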