Spaces · Build error

fikird committed · Commit f424b55 · Parent(s): 03649cb
Improve content processing and result formatting

Files changed: app.py (+33 -31), search_engine.py (+81 -85)
app.py CHANGED

@@ -4,6 +4,7 @@ import torch
 import os
 import logging
 import traceback
+import textwrap

 # Configure logging
 logging.basicConfig(

@@ -28,42 +29,46 @@ def safe_search(query, max_results):
     return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"

 def format_results(results):
-    """Format search results into a …
+    """Format search results into a readable markdown string"""
     if 'error' in results:
         return f"❌ Error: {results['error']}"

     output = []

     # Add insights section
-    if 'insights' in results:
-        …
-        output.append("…
-        output.append(insights['summary'])
-        output.append("\n")
-
-        if 'key_points' in insights and len(insights['key_points']) > 5:
-            output.append("\n## 📌 Additional Points\n")
-            for point in insights['key_points'][5:]:
-                output.append(f"• {point}")
-            output.append("\n")
+    if 'insights' in results and results['insights']:
+        output.append("# 🔍 Search Results\n")
+        output.append("## 💡 Key Insights")
+        output.append(results['insights'])
+        output.append("")

-    # Add …
-    if '…
-        output.append("…
-        for …
-            output.append(f"…
-            output.append(f"🔗 [View Source]({source['url']})\n")
-            if 'summary' in source:
-                output.append(f"\n{source['summary']}\n")
+    # Add key points section
+    if 'key_points' in results and results['key_points']:
+        output.append("## 🎯 Key Points")
+        for point in results['key_points']:
+            output.append(f"• {point}")
+        output.append("")
+
+    # Add detailed results section
+    if 'results' in results and results['results']:
+        output.append("## 📄 Detailed Results")
+        for i, result in enumerate(results['results'], 1):
+            output.append(f"### {i}. {result['title']}")
+            if 'description' in result and result['description']:
+                output.append(f"*{result['description']}*")
+            if 'summary' in result and result['summary']:
+                # Wrap summary text for better readability
+                wrapped_summary = textwrap.fill(result['summary'], width=80)
+                output.append(f"\n{wrapped_summary}")
+            if 'url' in result:
+                output.append(f"\n🔗 [Read more]({result['url']})")
+            output.append("")

-    # Add …
-    if '…
-        output.append("…
+    # Add follow-up questions section
+    if 'follow_up_questions' in results and results['follow_up_questions']:
+        output.append("## ❓ Follow-up Questions")
     for question in results['follow_up_questions']:
-        output.append(f"• {question}…
+        output.append(f"• {question}")

     return "\n".join(output)

@@ -90,10 +95,7 @@ def create_demo():
     )
     search_button = gr.Button("🔍 Search")

-    output = gr.Markdown(
-        label="Search Results",
-        show_label=True
-    )
+    output = gr.Markdown()

     search_button.click(
         fn=safe_search,
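For reference, a quick way to eyeball the new markdown layout produced by format_results(). The dict below is made-up sample data, but its keys ('insights', 'key_points', 'results', 'follow_up_questions') are exactly the ones the rewritten function reads; the import assumes app.py can be imported without launching the Gradio demo:

# Hypothetical smoke test for format_results(); all sample values are invented.
from app import format_results

sample = {
    'insights': 'Quantum computing hardware is scaling rapidly.',
    'key_points': ['Error correction remains the main bottleneck.'],
    'results': [{
        'title': 'Made-up article',
        'description': 'A made-up description.',
        'summary': ('A made-up summary that is long enough for textwrap.fill '
                    'to wrap onto a second line at the 80-column width.'),
        'url': 'https://example.com/article',
    }],
    'follow_up_questions': ['What is a logical qubit?'],
}

print(format_results(sample))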
search_engine.py CHANGED

@@ -50,52 +50,77 @@ class ContentProcessor:
         # Remove extra whitespace
         text = ' '.join(text.split())
         # Remove common navigation elements
-        nav_elements = [
-            …
-            …
+        nav_elements = [
+            "skip to content",
+            "skip to navigation",
+            "search",
+            "menu",
+            "submit",
+            "subscribe",
+            "browse",
+            "explore",
+            "more",
+            "all press releases",
+            "media resources",
+            "media contacts",
+            "investor relations"
+        ]
+        for element in nav_elements:
+            text = text.replace(element.lower(), "")
         return text.strip()
-
-    def …
-        """Extract …
-        …
-        all_points = []
-        …(
-            max_length=100,
-            min_length=30,
-            do_sample=False
-        )[0]['summary_text']
-
-        # Split summary into sentences
-        points = [p.strip() for p in summary.split('.') if p.strip()]
-        all_points.extend(points)
-        …
-        logger.error(f"Error extracting key points: {str(e)}")
-        return []
-
-    def …
+
+    def extract_main_content(self, soup: BeautifulSoup) -> str:
+        """Extract main content from HTML"""
+        # Remove navigation, header, footer, and sidebar elements
+        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
+            elem.decompose()
+
+        # Remove common non-content elements
+        for elem in soup.find_all(class_=['menu', 'navigation', 'sidebar', 'footer', 'header']):
+            elem.decompose()
+
+        # Extract text from remaining elements
+        content_elements = []
+
+        # Look for main content containers
+        main_content = soup.find(['main', 'article', 'div'], class_=['content', 'main', 'article'])
+        if main_content:
+            content_elements.append(main_content.get_text())
+        else:
+            # If no main content found, look for paragraphs
+            paragraphs = soup.find_all('p')
+            content_elements.extend(p.get_text() for p in paragraphs)
+
+        return ' '.join(content_elements)
+
+    def extract_key_points(self, text: str) -> List[str]:
+        """Extract key points from text"""
+        # Split into sentences
+        sentences = text.split('.')
+        key_points = []
+
+        for sentence in sentences:
+            # Look for sentences with important keywords
+            keywords = ['quantum', 'computer', 'research', 'development', 'breakthrough', 'innovation']
+            if any(keyword in sentence.lower() for keyword in keywords):
+                cleaned = sentence.strip()
+                if cleaned and len(cleaned) > 20:  # Avoid very short sentences
+                    key_points.append(cleaned)
+
+        return key_points[:5]  # Return top 5 key points
+
+    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # …
+            # Extract main content if HTML is available
+            if soup:
+                content = self.extract_main_content(soup)
+
+            # Clean the text
             cleaned_content = self.clean_text(content)

-            …
-            …
-                'summary': "No meaningful content found",
-                'content': content,
-                'key_points': [],
-                'topics': []
-            }
+            # Extract key points
+            key_points = self.extract_key_points(cleaned_content)

             # Generate summary
             summary = self.model_manager.models['summarizer'](
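As a standalone sanity check of the extraction order in extract_main_content() above (strip page chrome, prefer a main/article container, fall back to bare <p> tags); the HTML snippet is made up, and html.parser stands in for the lxml parser the Space uses:

from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>Skip to content</nav>
  <article class="content"><p>Actual body text.</p></article>
  <footer>Media contacts</footer>
</body></html>
"""
soup = BeautifulSoup(html, 'html.parser')

# Same steps as extract_main_content(): drop chrome elements first,
# then prefer an explicit content container over loose paragraphs.
for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
    elem.decompose()
main = soup.find(['main', 'article', 'div'], class_=['content', 'main', 'article'])
text = main.get_text() if main else ' '.join(p.get_text() for p in soup.find_all('p'))
print(text.strip())  # -> Actual body text.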
@@ -105,30 +130,16 @@ class ContentProcessor:
                 do_sample=False
             )[0]['summary_text']

-            # Extract key points
-            key_points = self.extract_key_points(cleaned_content)
-
-            # Extract main topics using embeddings
-            embeddings = self.model_manager.models['embeddings'].embed_documents(
-                [cleaned_content[:2048]]
-            )
-
-            # You could add topic modeling here if needed
-
             return {
                 'summary': summary,
-                'content': cleaned_content,
                 'key_points': key_points,
-                '…
+                'content': cleaned_content
             }
-
         except Exception as e:
-            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
-                'content': content,
                 'key_points': [],
-                '…
+                'content': content
             }

 class WebSearchEngine:
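The 'key_points' values flowing through the return dicts above come from the new keyword filter in extract_key_points(); its logic is easy to re-run outside the class. Same keyword list and thresholds as the diff; the sample text is invented:

# Re-running the extract_key_points() filter standalone.
text = ("Quantum computer research hit a new milestone this year. "
        "The weather was nice. "
        "This breakthrough could reshape cryptography standards.")

keywords = ['quantum', 'computer', 'research', 'development', 'breakthrough', 'innovation']
key_points = []
for sentence in text.split('.'):
    if any(k in sentence.lower() for k in keywords):
        cleaned = sentence.strip()
        if cleaned and len(cleaned) > 20:  # skip very short fragments
            key_points.append(cleaned)

print(key_points[:5])
# ['Quantum computer research hit a new milestone this year',
#  'This breakthrough could reshape cryptography standards']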
@@ -204,28 +215,19 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')

-            # …
-            …
-                script.decompose()
-            text = soup.get_text()
-            lines = (line.strip() for line in text.splitlines())
-            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-            content = ' '.join(chunk for chunk in chunks if chunk)
+            # Process content with HTML context
+            processed = self.processor.process_content(response.text, soup)

             # Get metadata
             metadata = self.get_metadata(soup)

-            # Process content
-            processed = self.processor.process_content(content)
-
             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
-                'content': processed['content'],
                 'key_points': processed['key_points'],
-                '…
+                'content': processed['content']
             }

         except Exception as e:
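The hunk above wires process_content() directly into parse_page(); the final hunk below replaces the old hand-built insights dict with a second summarization pass over the concatenated per-page summaries. A standalone sketch of that call follows; the checkpoint name is an assumption, since this diff never shows which model ModelManager loads behind models['summarizer']:

from transformers import pipeline

# Assumption: a generic BART summarization checkpoint; the actual model
# behind models['summarizer'] is not visible in this diff.
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')

per_page_summaries = [
    "First page summary describing the topic in a few sentences.",
    "Second page summary adding further detail on the same topic.",
]
combined_summary = " ".join(per_page_summaries)

# Same parameters as the diff. Note min_length=100 forces long output
# even when the combined input is short, which can pad or repeat.
insights = summarizer(combined_summary, max_length=200, min_length=100,
                      do_sample=False)[0]['summary_text']
print(insights)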
@@ -306,35 +308,29 @@ class WebSearchEngine:
                 if 'key_points' in processed:
                     all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))

         if not results:
             return {'error': 'Failed to process any search results'}

-        # Combine and …
-        …
-        unique_points.sort(key=len, reverse=True)
+        # Combine all summaries and key points
+        combined_summary = " ".join([r['summary'] for r in results if 'summary' in r])

-        # Generate …
-        insights = …
-        …
-                    'url': r.get('url', ''),
-                    'summary': r.get('summary', '')
-                }
-                for r in results
-            ]
-        }
+        # Generate final insights
+        insights = self.processor.model_manager.models['summarizer'](
+            combined_summary,
+            max_length=200,
+            min_length=100,
+            do_sample=False
+        )[0]['summary_text']

         return {
             'results': results,
             'insights': insights,
+            'key_points': list(set(all_key_points)),  # Remove duplicates
             'follow_up_questions': [
-                f"What are the …
-                f"…
-                f"What are the …
+                f"What are the key differences between {query} and related topics?",
+                f"Can you explain {query} in simple terms?",
+                f"What are the latest developments in {query}?"
             ]
         }