fikird committed · 8e83c5f (parent: f424b55)
Enhance content processing with better summarization and topic extraction

Files changed:
- app.py (+41, -36)
- search_engine.py (+91, -73)
app.py
CHANGED
@@ -4,7 +4,6 @@ import torch
 import os
 import logging
 import traceback
-import textwrap
 
 # Configure logging
 logging.basicConfig(
@@ -29,48 +28,54 @@ def safe_search(query, max_results):
         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
 
 def format_results(results):
+    """Format search results for display"""
+    if not results or not results.get('results'):
+        return "# ⚠️ No Results\nNo search results were found. Please try a different query."
+
+    formatted = f"# 🔍 Search Results\n\n"
 
     # Add insights section
-    if 'insights' in results:
-        output.append("## 💡 Key Insights")
-        output.append(results['insights'])
-        output.append("")
+    if 'insights' in results:
+        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
 
+    # Add follow-up questions
+    if 'follow_up_questions' in results:
+        formatted += "## ❓ Follow-up Questions\n"
+        for q in results['follow_up_questions']:
+            if q and q.strip():
+                formatted += f"- {q.strip()}\n"
+        formatted += "\n"
 
+    # Add main results
+    if 'results' in results:
+        formatted += "## 📝 Detailed Results\n\n"
         for i, result in enumerate(results['results'], 1):
+            if not isinstance(result, dict):
+                continue
+
+            formatted += f"### {i}. "
-            # Wrap summary text for better readability
-            wrapped_summary = textwrap.fill(result['summary'], width=80)
-            output.append(f"\n{wrapped_summary}")
             if 'url' in result:
+                title = result.get('title', 'Untitled')
+                formatted += f"[{title}]({result['url']})\n"
+            if 'summary' in result:
+                formatted += f"\n{result['summary']}\n\n"
+
+    # Add similar chunks if available
+    if 'similar_chunks' in results:
+        formatted += "## 🔗 Related Content\n\n"
+        for i, chunk in enumerate(results['similar_chunks'], 1):
+            if not isinstance(chunk, dict):
+                continue
+
+            formatted += f"### Related {i}\n"
+            if 'metadata' in chunk:
+                meta = chunk['metadata']
+                if 'title' in meta and 'url' in meta:
+                    formatted += f"From [{meta['title']}]({meta['url']})\n"
+            if 'content' in chunk:
+                formatted += f"\n{chunk['content'][:200]}...\n\n"
 
+    return formatted
 
 def create_demo():
     """Create the Gradio interface"""
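For orientation, here is a minimal usage sketch of the new format_results. The sample_results dict and all of its values are invented for illustration (they are not from the repository), and 'insights' is shown as a plain string since the function interpolates it directly into the Markdown:

# Hypothetical sample data; assumes format_results from app.py is importable
sample_results = {
    'insights': 'Quantum hardware and error correction are advancing quickly.',
    'follow_up_questions': ['What is quantum error correction?'],
    'results': [
        {
            'url': 'https://example.com/quantum',
            'title': 'Quantum Computing Advances',
            'summary': 'Recent progress in quantum processors and algorithms.'
        }
    ],
    'similar_chunks': [
        {
            'metadata': {'title': 'Quantum Computing Advances', 'url': 'https://example.com/quantum'},
            'content': 'Researchers report longer qubit coherence times and lower error rates...'
        }
    ]
}

print(format_results(sample_results))
# Prints a Markdown document with sections for key insights, follow-up questions,
# detailed results (linked titles plus summaries), and related content.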
search_engine.py
CHANGED
@@ -47,98 +47,97 @@ class ContentProcessor:
 
     def clean_text(self, text: str) -> str:
         """Clean and normalize text content"""
-        # Remove extra whitespace
+        # Remove extra whitespace and normalize
+        lines = [line.strip() for line in text.splitlines()]
+        text = ' '.join(line for line in lines if line)
+
+        # Remove redundant spaces
         text = ' '.join(text.split())
+
         # Remove common navigation elements
+        nav_patterns = [
             "skip to content",
-            "skip to navigation",
             "search",
             "menu",
+            "navigation",
             "subscribe",
-            "more",
-            "all press releases",
-            "media resources",
-            "media contacts",
-            "investor relations"
+            "sign in",
+            "log in"
         ]
+        for pattern in nav_patterns:
+            text = text.replace(pattern, "")
+
+        return text
 
-        # Look for main content containers
-        main_content = soup.find(['main', 'article', 'div'], class_=['content', 'main', 'article'])
-        if main_content:
-            content_elements.append(main_content.get_text())
-        else:
-            # If no main content found, look for paragraphs
-            paragraphs = soup.find_all('p')
-            content_elements.extend(p.get_text() for p in paragraphs)
 
-        """Extract key points from text"""
-        # Split into sentences
-        sentences = text.split('.')
-        key_points = []
-
-        for sentence in sentences:
-            # Look for sentences with important keywords
-            keywords = ['quantum', 'computer', 'research', 'development', 'breakthrough', 'innovation']
-            if any(keyword in sentence.lower() for keyword in keywords):
-                cleaned = sentence.strip()
-                if cleaned and len(cleaned) > 20:  # Avoid very short sentences
-                    key_points.append(cleaned)
-
-        return key_points[:5]  # Return top 5 key points
+    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
+        """Extract key points from text using the summarizer"""
+        try:
+            # Split text into chunks of ~1000 characters
+            chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
 
+            all_points = []
+            for chunk in chunks[:3]:  # Process first 3 chunks only
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+
+                # Split into sentences and add as points
+                sentences = [s.strip() for s in summary.split('.') if s.strip()]
+                all_points.extend(sentences)
 
+            # Return unique points, limited to max_points
+            unique_points = list(dict.fromkeys(all_points))
+            return unique_points[:max_points]
 
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []
 
+    def process_content(self, content: str) -> Dict:
         """Process content and generate insights"""
         try:
-            # Extract main content if HTML is available
-            if soup:
-                content = self.extract_main_content(soup)
-
             # Clean the text
+            cleaned_text = self.clean_text(content)
 
             # Extract key points
+            key_points = self.extract_key_points(cleaned_text)
 
-            # Generate summary
+            # Generate a concise summary
             summary = self.model_manager.models['summarizer'](
+                cleaned_text[:1024],
                 max_length=150,
                 min_length=50,
                 do_sample=False
             )[0]['summary_text']
 
+            # Extract potential topics/keywords
+            topics = []
+            common_topics = [
+                "quantum computing", "quantum processors", "quantum bits",
+                "quantum algorithms", "quantum supremacy", "quantum advantage",
+                "error correction", "quantum hardware", "quantum software",
+                "quantum research", "quantum applications"
+            ]
+
+            for topic in common_topics:
+                if topic.lower() in cleaned_text.lower():
+                    topics.append(topic)
+
             return {
                 'summary': summary,
                 'key_points': key_points,
+                'topics': topics[:5],  # Limit to top 5 topics
+                'content': cleaned_text
             }
+
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
+                'topics': [],
                 'content': content
             }
 
@@ -215,18 +214,27 @@ class WebSearchEngine:
         response = self.safe_get(url)
         soup = BeautifulSoup(response.text, 'lxml')
 
+        # Extract text content
+        for script in soup(["script", "style"]):
+            script.decompose()
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        content = ' '.join(chunk for chunk in chunks if chunk)
 
         # Get metadata
         metadata = self.get_metadata(soup)
 
+        # Process content
+        processed = self.processor.process_content(content)
+
         return {
             'url': url,
             'title': metadata['title'],
             'description': metadata['description'],
             'summary': processed['summary'],
             'key_points': processed['key_points'],
+            'topics': processed['topics'],
             'content': processed['content']
         }
 
@@ -299,38 +307,48 @@ class WebSearchEngine:
 
         results = []
         all_key_points = []
+        all_topics = set()
 
         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
+                    # Collect key points and topics
                     if 'key_points' in processed:
                         all_key_points.extend(processed['key_points'])
+                    if 'topics' in processed:
+                        all_topics.update(processed.get('topics', []))
                 time.sleep(random.uniform(0.5, 1.0))
+
         if not results:
             return {'error': 'Failed to process any search results'}
 
+        # Combine all summaries
+        all_summaries = " ".join([r['summary'] for r in results if 'summary' in r])
 
+        # Generate a meta-summary of all content
+        meta_summary = self.processor.model_manager.models['summarizer'](
+            all_summaries[:1024],
             max_length=200,
             min_length=100,
             do_sample=False
         )[0]['summary_text']
 
+        # Get unique key points
+        unique_key_points = list(dict.fromkeys(all_key_points))
+
         return {
             'results': results,
+            'insights': {
+                'summary': meta_summary,
+                'key_points': unique_key_points[:7],  # Top 7 key points
+                'topics': list(all_topics)[:5]  # Top 5 topics
+            },
             'follow_up_questions': [
+                f"What are the recent breakthroughs in {', '.join(list(all_topics)[:2])}?",
+                f"How do these developments impact the future of quantum computing?",
+                f"What are the practical applications of these quantum computing advances?"
            ]
        }
 
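Taken together, these changes mean WebSearchEngine.search() now returns a nested payload rather than a flat insights string. Below is a sketch of that shape with invented sample values; only the keys mirror the diff above:

# Illustrative payload shape after this commit; every value is made-up sample data
example_payload = {
    'results': [
        {
            'url': 'https://example.com/quantum',
            'title': 'Quantum Computing Advances',
            'description': 'Overview page.',
            'summary': 'Per-page summarizer output.',
            'key_points': ['Qubit counts are increasing.'],
            'topics': ['quantum computing', 'error correction'],
            'content': 'Cleaned page text...'
        }
    ],
    'insights': {
        'summary': 'Meta-summary generated from all page summaries.',
        'key_points': ['Qubit counts are increasing.'],       # deduplicated, top 7
        'topics': ['quantum computing', 'error correction']   # top 5
    },
    'follow_up_questions': [
        'What are the recent breakthroughs in quantum computing, error correction?',
        'How do these developments impact the future of quantum computing?',
        'What are the practical applications of these quantum computing advances?'
    ]
}

# Order-preserving de-duplication used for key points in the diff:
# list(dict.fromkeys(['a', 'b', 'a'])) -> ['a', 'b']

Note that app.py's format_results interpolates results['insights'] directly into an f-string, so with this nested shape the insights section renders the dict's repr rather than formatted text.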