Spaces:
Build error
Build error
fikird
committed on
Commit
·
03649cb
1
Parent(s):
636f8ae
Improve content processing and result formatting
Browse files- app.py +35 -42
- search_engine.py +74 -60
app.py
CHANGED
@@ -28,54 +28,44 @@ def safe_search(query, max_results):
|
|
28 |
return f"# β Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
|
29 |
|
30 |
def format_results(results):
|
31 |
-
"""Format search results
|
32 |
-
if
|
33 |
-
return "
|
34 |
-
|
35 |
-
|
36 |
|
37 |
# Add insights section
|
38 |
if 'insights' in results:
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
# Add follow-up questions
|
42 |
if 'follow_up_questions' in results:
|
43 |
-
|
44 |
-
for
|
45 |
-
|
46 |
-
formatted += f"- {q.strip()}\n"
|
47 |
-
formatted += "\n"
|
48 |
-
|
49 |
-
# Add main results
|
50 |
-
if 'results' in results:
|
51 |
-
formatted += "## π Detailed Results\n\n"
|
52 |
-
for i, result in enumerate(results['results'], 1):
|
53 |
-
if not isinstance(result, dict):
|
54 |
-
continue
|
55 |
-
|
56 |
-
formatted += f"### {i}. "
|
57 |
-
if 'url' in result:
|
58 |
-
title = result.get('title', 'Untitled')
|
59 |
-
formatted += f"[{title}]({result['url']})\n"
|
60 |
-
if 'summary' in result:
|
61 |
-
formatted += f"\n{result['summary']}\n\n"
|
62 |
-
|
63 |
-
# Add similar chunks if available
|
64 |
-
if 'similar_chunks' in results:
|
65 |
-
formatted += "## π Related Content\n\n"
|
66 |
-
for i, chunk in enumerate(results['similar_chunks'], 1):
|
67 |
-
if not isinstance(chunk, dict):
|
68 |
-
continue
|
69 |
-
|
70 |
-
formatted += f"### Related {i}\n"
|
71 |
-
if 'metadata' in chunk:
|
72 |
-
meta = chunk['metadata']
|
73 |
-
if 'title' in meta and 'url' in meta:
|
74 |
-
formatted += f"From [{meta['title']}]({meta['url']})\n"
|
75 |
-
if 'content' in chunk:
|
76 |
-
formatted += f"\n{chunk['content'][:200]}...\n\n"
|
77 |
|
78 |
-
return
|
79 |
|
80 |
def create_demo():
|
81 |
"""Create the Gradio interface"""
|
@@ -100,7 +90,10 @@ def create_demo():
|
|
100 |
)
|
101 |
search_button = gr.Button("π Search")
|
102 |
|
103 |
-
output = gr.Markdown(
|
|
|
|
|
|
|
104 |
|
105 |
search_button.click(
|
106 |
fn=safe_search,
|
|
|
28 |
return f"# β Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
|
29 |
|
30 |
def format_results(results):
|
31 |
+
"""Format search results into a clean markdown output"""
|
32 |
+
if 'error' in results:
|
33 |
+
return f"β Error: {results['error']}"
|
34 |
+
|
35 |
+
output = []
|
36 |
|
37 |
# Add insights section
|
38 |
if 'insights' in results:
|
39 |
+
insights = results['insights']
|
40 |
+
output.append("# 🎯 Key Insights\n")
|
41 |
+
|
42 |
+
if 'summary' in insights:
|
43 |
+
output.append(insights['summary'])
|
44 |
+
output.append("\n")
|
45 |
+
|
46 |
+
if 'key_points' in insights and len(insights['key_points']) > 5:
|
47 |
+
output.append("\n## π Additional Points\n")
|
48 |
+
for point in insights['key_points'][5:]:
|
49 |
+
output.append(f"• {point}")
|
50 |
+
output.append("\n")
|
51 |
+
|
52 |
+
# Add sources section
|
53 |
+
if 'insights' in results and 'sources' in results['insights']:
|
54 |
+
output.append("\n# π Sources\n")
|
55 |
+
for idx, source in enumerate(results['insights']['sources'], 1):
|
56 |
+
output.append(f"\n## {idx}. {source['title']}\n")
|
57 |
+
if 'url' in source:
|
58 |
+
output.append(f"π [View Source]({source['url']})\n")
|
59 |
+
if 'summary' in source:
|
60 |
+
output.append(f"\n{source['summary']}\n")
|
61 |
|
62 |
# Add follow-up questions
|
63 |
if 'follow_up_questions' in results:
|
64 |
+
output.append("\n# β Suggested Questions\n")
|
65 |
+
for question in results['follow_up_questions']:
|
66 |
+
output.append(f"• {question}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
+
return "\n".join(output)
|
69 |
|
70 |
def create_demo():
|
71 |
"""Create the Gradio interface"""
|
|
|
90 |
)
|
91 |
search_button = gr.Button("π Search")
|
92 |
|
93 |
+
output = gr.Markdown(
|
94 |
+
label="Search Results",
|
95 |
+
show_label=True
|
96 |
+
)
|
97 |
|
98 |
search_button.click(
|
99 |
fn=safe_search,
|
search_engine.py
CHANGED
@@ -49,77 +49,86 @@ class ContentProcessor:
|
|
49 |
"""Clean and normalize text content"""
|
50 |
# Remove extra whitespace
|
51 |
text = ' '.join(text.split())
|
52 |
-
# Remove
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
if not any(header in line for header in common_headers) and len(line) > 20:
|
58 |
-
lines.append(line)
|
59 |
-
return ' '.join(lines)
|
60 |
|
61 |
-
def extract_key_points(self, content: str) -> List[str]:
|
62 |
"""Extract key points from content using AI"""
|
63 |
try:
|
64 |
-
# Split content into chunks for processing
|
65 |
-
chunks = [content[i:i+1024] for i in range(0, len(content), 1024)]
|
66 |
-
|
67 |
|
68 |
for chunk in chunks:
|
69 |
-
# Generate focused summary for each chunk
|
70 |
summary = self.model_manager.models['summarizer'](
|
71 |
chunk,
|
72 |
-
max_length=
|
73 |
-
min_length=
|
74 |
-
do_sample=False
|
75 |
-
num_beams=4,
|
76 |
-
length_penalty=2.0,
|
77 |
-
early_stopping=True
|
78 |
)[0]['summary_text']
|
79 |
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
-
return key_points
|
83 |
except Exception as e:
|
84 |
logger.error(f"Error extracting key points: {str(e)}")
|
85 |
return []
|
86 |
|
87 |
-
def process_content(self, content: str
|
88 |
"""Process content and generate insights"""
|
89 |
try:
|
90 |
# Clean the content
|
91 |
cleaned_content = self.clean_text(content)
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
|
99 |
# Extract key points
|
100 |
key_points = self.extract_key_points(cleaned_content)
|
101 |
|
102 |
-
#
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
num_beams=4,
|
109 |
-
length_penalty=2.0,
|
110 |
-
early_stopping=True
|
111 |
-
)[0]['summary_text']
|
112 |
|
113 |
return {
|
114 |
'summary': summary,
|
|
|
115 |
'key_points': key_points,
|
116 |
-
'
|
117 |
}
|
|
|
118 |
except Exception as e:
|
|
|
119 |
return {
|
120 |
'summary': f"Error processing content: {str(e)}",
|
|
|
121 |
'key_points': [],
|
122 |
-
'
|
123 |
}
|
124 |
|
125 |
class WebSearchEngine:
|
@@ -207,15 +216,16 @@ class WebSearchEngine:
|
|
207 |
metadata = self.get_metadata(soup)
|
208 |
|
209 |
# Process content
|
210 |
-
processed = self.processor.process_content(content
|
211 |
|
212 |
return {
|
213 |
'url': url,
|
214 |
'title': metadata['title'],
|
215 |
'description': metadata['description'],
|
216 |
'summary': processed['summary'],
|
|
|
217 |
'key_points': processed['key_points'],
|
218 |
-
'
|
219 |
}
|
220 |
|
221 |
except Exception as e:
|
@@ -292,36 +302,40 @@ class WebSearchEngine:
|
|
292 |
if 'link' in result:
|
293 |
processed = self.process_url(result['link'])
|
294 |
if 'error' not in processed:
|
295 |
-
# Add original search snippet
|
296 |
-
processed['snippet'] = result.get('snippet', '')
|
297 |
results.append(processed)
|
298 |
-
# Collect key points
|
299 |
if 'key_points' in processed:
|
300 |
all_key_points.extend(processed['key_points'])
|
301 |
time.sleep(random.uniform(0.5, 1.0))
|
302 |
-
|
303 |
if not results:
|
304 |
return {'error': 'Failed to process any search results'}
|
305 |
|
306 |
-
#
|
307 |
-
|
308 |
-
|
309 |
-
# Group similar points and remove duplicates
|
310 |
-
unique_points = list(set(all_key_points))
|
311 |
-
insights = self.processor.extract_key_points(' '.join(unique_points))
|
312 |
|
313 |
-
# Generate
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
320 |
|
321 |
return {
|
322 |
'results': results,
|
323 |
-
'insights': insights
|
324 |
-
'follow_up_questions':
|
|
|
|
|
|
|
|
|
325 |
}
|
326 |
|
327 |
except Exception as e:
|
|
|
49 |
"""Clean and normalize text content"""
|
50 |
# Remove extra whitespace
|
51 |
text = ' '.join(text.split())
|
52 |
+
# Remove common navigation elements
|
53 |
+
nav_elements = ['Skip to content', 'Search', 'Menu', 'Navigation', 'Subscribe', 'Follow']
|
54 |
+
for elem in nav_elements:
|
55 |
+
text = text.replace(elem, '')
|
56 |
+
return text.strip()
|
|
|
|
|
|
|
57 |
|
58 |
+
def extract_key_points(self, content: str, max_points: int = 5) -> List[str]:
|
59 |
"""Extract key points from content using AI"""
|
60 |
try:
|
61 |
+
# Split content into smaller chunks for processing
|
62 |
+
chunks = [content[i:i + 1024] for i in range(0, len(content), 1024)]
|
63 |
+
all_points = []
|
64 |
|
65 |
for chunk in chunks:
|
|
|
66 |
summary = self.model_manager.models['summarizer'](
|
67 |
chunk,
|
68 |
+
max_length=100,
|
69 |
+
min_length=30,
|
70 |
+
do_sample=False
|
|
|
|
|
|
|
71 |
)[0]['summary_text']
|
72 |
|
73 |
+
# Split summary into sentences
|
74 |
+
points = [p.strip() for p in summary.split('.') if p.strip()]
|
75 |
+
all_points.extend(points)
|
76 |
+
|
77 |
+
# Return unique points, prioritizing longer, more informative ones
|
78 |
+
unique_points = list(set(all_points))
|
79 |
+
unique_points.sort(key=len, reverse=True)
|
80 |
+
return unique_points[:max_points]
|
81 |
|
|
|
82 |
except Exception as e:
|
83 |
logger.error(f"Error extracting key points: {str(e)}")
|
84 |
return []
|
85 |
|
86 |
+
def process_content(self, content: str) -> Dict:
|
87 |
"""Process content and generate insights"""
|
88 |
try:
|
89 |
# Clean the content
|
90 |
cleaned_content = self.clean_text(content)
|
91 |
|
92 |
+
if not cleaned_content:
|
93 |
+
return {
|
94 |
+
'summary': "No meaningful content found",
|
95 |
+
'content': content,
|
96 |
+
'key_points': [],
|
97 |
+
'topics': []
|
98 |
+
}
|
99 |
+
|
100 |
+
# Generate summary
|
101 |
+
summary = self.model_manager.models['summarizer'](
|
102 |
+
cleaned_content[:1024],
|
103 |
+
max_length=150,
|
104 |
+
min_length=50,
|
105 |
+
do_sample=False
|
106 |
+
)[0]['summary_text']
|
107 |
|
108 |
# Extract key points
|
109 |
key_points = self.extract_key_points(cleaned_content)
|
110 |
|
111 |
+
# Extract main topics using embeddings
|
112 |
+
embeddings = self.model_manager.models['embeddings'].embed_documents(
|
113 |
+
[cleaned_content[:2048]]
|
114 |
+
)
|
115 |
+
|
116 |
+
# You could add topic modeling here if needed
|
|
|
|
|
|
|
|
|
117 |
|
118 |
return {
|
119 |
'summary': summary,
|
120 |
+
'content': cleaned_content,
|
121 |
'key_points': key_points,
|
122 |
+
'topics': [] # Reserved for future topic modeling
|
123 |
}
|
124 |
+
|
125 |
except Exception as e:
|
126 |
+
logger.error(f"Error processing content: {str(e)}")
|
127 |
return {
|
128 |
'summary': f"Error processing content: {str(e)}",
|
129 |
+
'content': content,
|
130 |
'key_points': [],
|
131 |
+
'topics': []
|
132 |
}
|
133 |
|
134 |
class WebSearchEngine:
|
|
|
216 |
metadata = self.get_metadata(soup)
|
217 |
|
218 |
# Process content
|
219 |
+
processed = self.processor.process_content(content)
|
220 |
|
221 |
return {
|
222 |
'url': url,
|
223 |
'title': metadata['title'],
|
224 |
'description': metadata['description'],
|
225 |
'summary': processed['summary'],
|
226 |
+
'content': processed['content'],
|
227 |
'key_points': processed['key_points'],
|
228 |
+
'topics': processed['topics']
|
229 |
}
|
230 |
|
231 |
except Exception as e:
|
|
|
302 |
if 'link' in result:
|
303 |
processed = self.process_url(result['link'])
|
304 |
if 'error' not in processed:
|
|
|
|
|
305 |
results.append(processed)
|
|
|
306 |
if 'key_points' in processed:
|
307 |
all_key_points.extend(processed['key_points'])
|
308 |
time.sleep(random.uniform(0.5, 1.0))
|
309 |
+
|
310 |
if not results:
|
311 |
return {'error': 'Failed to process any search results'}
|
312 |
|
313 |
+
# Combine and deduplicate key points
|
314 |
+
unique_points = list(set(all_key_points))
|
315 |
+
unique_points.sort(key=len, reverse=True)
|
|
|
|
|
|
|
316 |
|
317 |
+
# Generate comprehensive insights
|
318 |
+
insights = {
|
319 |
+
'summary': "Key Findings:\n" + "\n".join(f"• {point}" for point in unique_points[:5]),
|
320 |
+
'key_points': unique_points[:10],
|
321 |
+
'sources': [
|
322 |
+
{
|
323 |
+
'title': r.get('title', 'Untitled'),
|
324 |
+
'url': r.get('url', ''),
|
325 |
+
'summary': r.get('summary', '')
|
326 |
+
}
|
327 |
+
for r in results
|
328 |
+
]
|
329 |
+
}
|
330 |
|
331 |
return {
|
332 |
'results': results,
|
333 |
+
'insights': insights,
|
334 |
+
'follow_up_questions': [
|
335 |
+
f"What are the practical applications of {query}?",
|
336 |
+
f"How does {query} impact current technology?",
|
337 |
+
f"What are the future prospects for {query}?"
|
338 |
+
]
|
339 |
}
|
340 |
|
341 |
except Exception as e:
|