fikird committed
Commit 2f58cc7 · 1 Parent(s): f2c01c1

Improve content processing with better extraction and formatting

Files changed (1):
  1. search_engine.py +114 -105
search_engine.py CHANGED
@@ -51,111 +51,101 @@ class ContentProcessor:
         text = ' '.join(text.split())
         # Remove common navigation elements
         nav_elements = [
-            "skip to content",
-            "search",
-            "menu",
-            "navigation",
-            "subscribe",
-            "sign in",
-            "log in",
-            "submit",
-            "browse",
+            "Skip to content",
+            "Search",
+            "Menu",
+            "Navigation",
+            "Subscribe",
+            "Browse",
+            "Submit",
+            "More",
+            "About",
+            "Contact",
+            "Privacy Policy",
+            "Terms of Use"
         ]
         for element in nav_elements:
-            text = text.replace(element.lower(), "")
+            text = text.replace(element, "")
         return text.strip()
 
     def extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content from HTML, prioritizing article content"""
-        content = ""
-
-        # Try to find main content containers
-        priority_tags = [
-            ('article', {}),
-            ('div', {'class': ['article', 'post', 'content', 'main']}),
-            ('div', {'id': ['article', 'post', 'content', 'main']}),
-            ('main', {}),
-        ]
-
-        for tag, attrs in priority_tags:
-            elements = soup.find_all(tag, attrs)
-            if elements:
-                content = " ".join(elem.get_text(strip=True) for elem in elements)
-                if content:
-                    break
-
-        # If no main content found, try extracting paragraphs
-        if not content:
-            paragraphs = soup.find_all('p')
-            content = " ".join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 100)
-
-        return self.clean_text(content)
-
-    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
-        """Extract key points from text using sentence transformers"""
-        try:
-            # Split into sentences
-            sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
-            if not sentences:
-                return []
-
-            # Get embeddings for sentences
-            embeddings = self.model_manager.models['embeddings'].embed_documents(sentences)
-
-            # Use simple clustering to find diverse sentences
-            selected_indices = [0]  # Start with first sentence
-            for _ in range(min(max_points - 1, len(sentences) - 1)):
-                # Find sentence most different from selected ones
-                max_diff = -1
-                max_idx = -1
-                for i in range(len(sentences)):
-                    if i not in selected_indices:
-                        # Calculate average difference from selected sentences
-                        diffs = [sum((embeddings[i][j] - embeddings[k][j])**2
-                                     for j in range(len(embeddings[i])))
-                                 for k in selected_indices]
-                        avg_diff = sum(diffs) / len(diffs)
-                        if avg_diff > max_diff:
-                            max_diff = avg_diff
-                            max_idx = i
-                if max_idx != -1:
-                    selected_indices.append(max_idx)
-
-            return [sentences[i] for i in selected_indices]
-        except Exception as e:
-            logger.error(f"Error extracting key points: {str(e)}")
-            return []
+        """Extract main content from HTML"""
+        # Remove navigation, headers, footers
+        for elem in soup.find_all(['nav', 'header', 'footer', 'script', 'style', 'meta', 'link']):
+            elem.decompose()
+
+        # Try to find main content container
+        main_content = None
+        content_tags = ['article', 'main', '[role="main"]', '.content', '#content', '.post', '.entry']
+
+        for tag in content_tags:
+            main_content = soup.select_one(tag)
+            if main_content:
+                break
+
+        if not main_content:
+            main_content = soup
+
+        # Extract text from paragraphs
+        paragraphs = main_content.find_all('p')
+        if paragraphs:
+            return ' '.join(p.get_text(strip=True) for p in paragraphs)
+
+        # Fallback to all text if no paragraphs found
+        return main_content.get_text(strip=True)
 
-    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
+    def process_content(self, content: str, html_content: str = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # Extract main content if HTML is available
-            if soup:
-                content = self.extract_main_content(soup)
-            else:
-                content = self.clean_text(content)
-
-            # Generate summary
-            summary = self.model_manager.models['summarizer'](
-                content[:1024],
-                max_length=150,
-                min_length=50,
-                do_sample=False
-            )[0]['summary_text']
-
-            # Extract key points
-            key_points = self.extract_key_points(content)
+            # Clean content
+            cleaned_content = self.clean_text(content)
+
+            # If HTML content is provided, try to extract main content
+            if html_content:
+                soup = BeautifulSoup(html_content, 'lxml')
+                main_content = self.extract_main_content(soup)
+                if main_content:
+                    cleaned_content = self.clean_text(main_content)
+
+            # Generate summary in chunks if content is too long
+            chunks = [cleaned_content[i:i+1024] for i in range(0, len(cleaned_content), 1024)]
+            summaries = []
+
+            for chunk in chunks[:3]:  # Process up to 3 chunks to avoid too long processing
+                try:
+                    summary = self.model_manager.models['summarizer'](
+                        chunk,
+                        max_length=150,
+                        min_length=50,
+                        do_sample=False
+                    )[0]['summary_text']
+                    summaries.append(summary)
+                except Exception as e:
+                    logger.warning(f"Error summarizing chunk: {str(e)}")
+                    continue
+
+            # Combine summaries
+            final_summary = ' '.join(summaries)
+
+            # Extract key points using bullet points
+            key_points = self.model_manager.models['summarizer'](
+                cleaned_content[:1024],
+                max_length=100,
+                min_length=30,
+                num_beams=4,
+                do_sample=True
+            )[0]['summary_text']
 
             return {
-                'summary': summary,
-                'content': content,
-                'key_points': key_points
+                'summary': final_summary,
+                'key_points': key_points,
+                'content': cleaned_content
             }
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
-                'content': content,
-                'key_points': []
+                'key_points': "",
+                'content': content
            }
 
 class WebSearchEngine:
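
Note: the chunking in the new process_content is plain character slicing, so chunk boundaries can split sentences mid-word, and only the first three chunks are summarized. A minimal standalone sketch of that slicing and the chunks[:3] cap; the summarize() stub is an illustrative assumption standing in for model_manager.models['summarizer'], which in the real code is a transformers summarization pipeline:

# Sketch: how the 1024-char chunking and 3-chunk cap behave.
def chunk_text(text: str, size: int = 1024) -> list:
    # Same slicing as the diff: fixed-width character windows, no overlap
    return [text[i:i + size] for i in range(0, len(text), size)]

def summarize(chunk: str) -> str:
    # Placeholder: pretend the first 60 characters are the "summary"
    return chunk[:60].strip()

text = "word " * 1000  # ~5000 chars -> 5 chunks, only 3 summarized
chunks = chunk_text(text)
summaries = [summarize(c) for c in chunks[:3]]  # cap mirrors the diff
print(len(chunks), "chunks;", len(summaries), "summarized")
print(' '.join(summaries)[:120])
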
@@ -234,8 +224,11 @@ class WebSearchEngine:
             # Get metadata
             metadata = self.get_metadata(soup)
 
-            # Process content
-            processed = self.processor.process_content("", soup=soup)
+            # Process content with both text and HTML
+            processed = self.processor.process_content(
+                soup.get_text(),
+                html_content=response.text
+            )
 
             return {
                 'url': url,
@@ -249,6 +242,35 @@
         except Exception as e:
             return {'error': f"Error processing {url}: {str(e)}"}
 
+    def format_results(self, results: List[Dict]) -> Dict:
+        """Format search results in a user-friendly way"""
+        formatted_insights = []
+        formatted_results = []
+
+        for result in results:
+            if 'error' not in result:
+                # Format key points
+                if result.get('key_points'):
+                    points = result['key_points'].split('. ')
+                    formatted_points = [f"• {point.strip()}" for point in points if point.strip()]
+                    formatted_insights.extend(formatted_points)
+
+                # Format detailed result
+                formatted_result = {
+                    'title': result['title'],
+                    'url': result['url'],
+                    'summary': result['summary'],
+                }
+                formatted_results.append(formatted_result)
+
+        # Remove duplicates while preserving order
+        formatted_insights = list(dict.fromkeys(formatted_insights))
+
+        return {
+            'insights': '\n'.join(formatted_insights[:10]),  # Top 10 insights
+            'results': formatted_results
+        }
+
     def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict]:
         """Search DuckDuckGo and parse HTML results"""
         search_results = []
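
Note: the new format_results splits each result's key_points string on '. ' and deduplicates the bullets with dict.fromkeys, a standard Python idiom that keeps first-seen order (Python 3.7+). A small sketch with invented sample strings:

# Sketch of the bullet formatting and order-preserving dedup in format_results.
key_points_per_result = [
    "AI adoption is growing. Costs are falling",
    "Costs are falling. New models ship monthly",
]

insights = []
for kp in key_points_per_result:
    insights.extend(f"• {p.strip()}" for p in kp.split('. ') if p.strip())

# dict.fromkeys keeps the first occurrence of each bullet, in order
insights = list(dict.fromkeys(insights))
print('\n'.join(insights[:10]))  # "Costs are falling" appears once
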
@@ -314,35 +336,22 @@ class WebSearchEngine:
             return {'error': 'No results found'}
 
         results = []
-        all_key_points = []
-
         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
-                    if 'key_points' in processed:
-                        all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))
 
         if not results:
             return {'error': 'Failed to process any search results'}
 
-        # Combine insights from all results
-        combined_summary = " ".join([r['summary'] for r in results if 'summary' in r])
-
-        # Generate overall insights
-        insights = self.processor.model_manager.models['summarizer'](
-            combined_summary,
-            max_length=200,
-            min_length=100,
-            do_sample=False
-        )[0]['summary_text']
+        # Format results in a user-friendly way
+        formatted = self.format_results(results)
 
         return {
-            'results': results,
-            'insights': insights,
-            'key_points': all_key_points[:10],  # Top 10 key points
+            'results': formatted['results'],
+            'insights': formatted['insights'],
             'follow_up_questions': [
                 f"What are the recent breakthroughs in {query}?",
                 f"How does {query} impact various industries?",
 