fikird committed
Commit 03649cb · 1 Parent(s): 636f8ae

Improve content processing and result formatting

Files changed (2)
  1. app.py +35 -42
  2. search_engine.py +74 -60
app.py CHANGED
@@ -28,54 +28,44 @@ def safe_search(query, max_results):
         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
 
 def format_results(results):
-    """Format search results for display"""
-    if not results or not results.get('results'):
-        return "# ⚠️ No Results\nNo search results were found. Please try a different query."
-
-    formatted = f"# 🔍 Search Results\n\n"
+    """Format search results into a clean markdown output"""
+    if 'error' in results:
+        return f"❌ Error: {results['error']}"
+
+    output = []
 
     # Add insights section
     if 'insights' in results:
-        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+        insights = results['insights']
+        output.append("# 🎯 Key Insights\n")
+
+        if 'summary' in insights:
+            output.append(insights['summary'])
+            output.append("\n")
+
+        if 'key_points' in insights and len(insights['key_points']) > 5:
+            output.append("\n## 📌 Additional Points\n")
+            for point in insights['key_points'][5:]:
+                output.append(f"• {point}")
+            output.append("\n")
+
+    # Add sources section
+    if 'insights' in results and 'sources' in results['insights']:
+        output.append("\n# 📚 Sources\n")
+        for idx, source in enumerate(results['insights']['sources'], 1):
+            output.append(f"\n## {idx}. {source['title']}\n")
+            if 'url' in source:
+                output.append(f"🔗 [View Source]({source['url']})\n")
+            if 'summary' in source:
+                output.append(f"\n{source['summary']}\n")
 
     # Add follow-up questions
     if 'follow_up_questions' in results:
-        formatted += "## ❓ Follow-up Questions\n"
-        for q in results['follow_up_questions']:
-            if q and q.strip():
-                formatted += f"- {q.strip()}\n"
-        formatted += "\n"
-
-    # Add main results
-    if 'results' in results:
-        formatted += "## 📄 Detailed Results\n\n"
-        for i, result in enumerate(results['results'], 1):
-            if not isinstance(result, dict):
-                continue
-
-            formatted += f"### {i}. "
-            if 'url' in result:
-                title = result.get('title', 'Untitled')
-                formatted += f"[{title}]({result['url']})\n"
-            if 'summary' in result:
-                formatted += f"\n{result['summary']}\n\n"
-
-    # Add similar chunks if available
-    if 'similar_chunks' in results:
-        formatted += "## 🔍 Related Content\n\n"
-        for i, chunk in enumerate(results['similar_chunks'], 1):
-            if not isinstance(chunk, dict):
-                continue
-
-            formatted += f"### Related {i}\n"
-            if 'metadata' in chunk:
-                meta = chunk['metadata']
-                if 'title' in meta and 'url' in meta:
-                    formatted += f"From [{meta['title']}]({meta['url']})\n"
-            if 'content' in chunk:
-                formatted += f"\n{chunk['content'][:200]}...\n\n"
+        output.append("\n# ❓ Suggested Questions\n")
+        for question in results['follow_up_questions']:
+            output.append(f"• {question}\n")
 
-    return formatted
+    return "\n".join(output)
 
 def create_demo():
     """Create the Gradio interface"""
@@ -100,7 +90,10 @@ def create_demo():
     )
     search_button = gr.Button("🔍 Search")
 
-    output = gr.Markdown()
+    output = gr.Markdown(
+        label="Search Results",
+        show_label=True
+    )
 
     search_button.click(
         fn=safe_search,
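
For reference, a minimal sketch (not part of the commit) of the result shape the rewritten format_results now consumes. All sample values are illustrative, and it assumes app.py is importable without side effects:

```python
# Hypothetical driver for the new format_results; sample data is made up.
from app import format_results  # assumes app.py imports cleanly

sample_results = {
    'insights': {
        'summary': "Key Findings:\n• Transformers dominate modern NLP.",
        'key_points': [
            "Point one", "Point two", "Point three",
            "Point four", "Point five",
            "Point six lands under 'Additional Points'",  # index 5 onward
        ],
        'sources': [{
            'title': 'Example Source',             # hypothetical source
            'url': 'https://example.com/article',  # placeholder URL
            'summary': 'One-paragraph summary of the page.',
        }],
    },
    'follow_up_questions': ["What are the practical applications of X?"],
}

print(format_results(sample_results))
# Renders "# 🎯 Key Insights", the summary, "## 📌 Additional Points"
# (only points beyond the first five), "# 📚 Sources", then the questions.
```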
search_engine.py CHANGED
@@ -49,77 +49,86 @@ class ContentProcessor:
         """Clean and normalize text content"""
         # Remove extra whitespace
         text = ' '.join(text.split())
-        # Remove redundant headers and navigation text
-        common_headers = ['skip to content', 'search', 'menu', 'navigation', 'subscribe']
-        lines = []
-        for line in text.split('\n'):
-            line = line.strip().lower()
-            if not any(header in line for header in common_headers) and len(line) > 20:
-                lines.append(line)
-        return ' '.join(lines)
+        # Remove common navigation elements
+        nav_elements = ['Skip to content', 'Search', 'Menu', 'Navigation', 'Subscribe', 'Follow']
+        for elem in nav_elements:
+            text = text.replace(elem, '')
+        return text.strip()
 
-    def extract_key_points(self, content: str) -> List[str]:
+    def extract_key_points(self, content: str, max_points: int = 5) -> List[str]:
         """Extract key points from content using AI"""
         try:
-            # Split content into chunks for processing
-            chunks = [content[i:i+1024] for i in range(0, len(content), 1024)]
-            key_points = []
+            # Split content into smaller chunks for processing
+            chunks = [content[i:i + 1024] for i in range(0, len(content), 1024)]
+            all_points = []
 
             for chunk in chunks:
-                # Generate focused summary for each chunk
                 summary = self.model_manager.models['summarizer'](
                     chunk,
-                    max_length=150,
-                    min_length=50,
-                    do_sample=False,
-                    num_beams=4,
-                    length_penalty=2.0,
-                    early_stopping=True
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
                 )[0]['summary_text']
 
-                key_points.append(summary)
+                # Split summary into sentences
+                points = [p.strip() for p in summary.split('.') if p.strip()]
+                all_points.extend(points)
 
-            return key_points
+            # Return unique points, prioritizing longer, more informative ones
+            unique_points = list(set(all_points))
+            unique_points.sort(key=len, reverse=True)
+            return unique_points[:max_points]
         except Exception as e:
             logger.error(f"Error extracting key points: {str(e)}")
             return []
 
-    def process_content(self, content: str, title: str = "", description: str = "") -> Dict:
+    def process_content(self, content: str) -> Dict:
         """Process content and generate insights"""
         try:
             # Clean the content
             cleaned_content = self.clean_text(content)
 
-            # Combine title and description with content for context
-            if title:
-                cleaned_content = f"{title}. {cleaned_content}"
-            if description:
-                cleaned_content = f"{description}. {cleaned_content}"
+            if not cleaned_content:
+                return {
+                    'summary': "No meaningful content found",
+                    'content': content,
+                    'key_points': [],
+                    'topics': []
+                }
+
+            # Generate summary
+            summary = self.model_manager.models['summarizer'](
+                cleaned_content[:1024],
+                max_length=150,
+                min_length=50,
+                do_sample=False
+            )[0]['summary_text']
 
             # Extract key points
             key_points = self.extract_key_points(cleaned_content)
 
-            # Generate overall summary
-            summary = self.model_manager.models['summarizer'](
-                ' '.join(key_points)[:1024],
-                max_length=200,
-                min_length=100,
-                do_sample=False,
-                num_beams=4,
-                length_penalty=2.0,
-                early_stopping=True
-            )[0]['summary_text']
+            # Extract main topics using embeddings
+            embeddings = self.model_manager.models['embeddings'].embed_documents(
+                [cleaned_content[:2048]]
+            )
+
+            # You could add topic modeling here if needed
 
             return {
                 'summary': summary,
+                'content': cleaned_content,
                 'key_points': key_points,
-                'content': cleaned_content
+                'topics': []  # Reserved for future topic modeling
             }
+
         except Exception as e:
+            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
+                'content': content,
                 'key_points': [],
-                'content': content
+                'topics': []
             }
 
 class WebSearchEngine:
@@ -207,15 +216,16 @@ class WebSearchEngine:
             metadata = self.get_metadata(soup)
 
             # Process content
-            processed = self.processor.process_content(content, metadata['title'], metadata['description'])
+            processed = self.processor.process_content(content)
 
             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
+                'content': processed['content'],
                 'key_points': processed['key_points'],
-                'content': processed['content']
+                'topics': processed['topics']
             }
 
         except Exception as e:
@@ -292,36 +302,40 @@
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
-                    # Add original search snippet
-                    processed['snippet'] = result.get('snippet', '')
                     results.append(processed)
-                    # Collect key points
                     if 'key_points' in processed:
                         all_key_points.extend(processed['key_points'])
             time.sleep(random.uniform(0.5, 1.0))
 
         if not results:
             return {'error': 'Failed to process any search results'}
 
-        # Generate comprehensive insights
-        insights = []
-        if all_key_points:
-            # Group similar points and remove duplicates
-            unique_points = list(set(all_key_points))
-            insights = self.processor.extract_key_points(' '.join(unique_points))
+        # Combine and deduplicate key points
+        unique_points = list(set(all_key_points))
+        unique_points.sort(key=len, reverse=True)
 
-        # Generate relevant follow-up questions
-        follow_up_questions = [
-            f"What are the practical applications of {query}?",
-            f"How does {query} impact industry and research?",
-            f"What challenges and limitations exist in {query}?",
-            f"What future developments are expected in {query}?"
-        ]
+        # Generate comprehensive insights
+        insights = {
+            'summary': "Key Findings:\n" + "\n".join(f"• {point}" for point in unique_points[:5]),
+            'key_points': unique_points[:10],
+            'sources': [
+                {
+                    'title': r.get('title', 'Untitled'),
+                    'url': r.get('url', ''),
+                    'summary': r.get('summary', '')
+                }
+                for r in results
+            ]
+        }
 
         return {
             'results': results,
-            'insights': insights if insights else ["No comprehensive insights available."],
-            'follow_up_questions': follow_up_questions
+            'insights': insights,
+            'follow_up_questions': [
+                f"What are the practical applications of {query}?",
+                f"How does {query} impact current technology?",
+                f"What are the future prospects for {query}?"
+            ]
         }
 
     except Exception as e:
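
A self-contained sketch (not part of the commit) of the deduplicate-and-rank step that both the new extract_key_points and search rely on; the key points below are made up:

```python
# Dedup-and-rank step shared by the new extract_key_points and search:
# drop duplicates, then keep the longest (assumed most informative) first.
all_key_points = [
    "Transformers rely on self-attention",
    "Attention weights every token pair",
    "Transformers rely on self-attention",  # duplicate from a second page
    "Training uses large corpora",
]

unique_points = list(set(all_key_points))  # deduplicate
unique_points.sort(key=len, reverse=True)  # longest first

insights_summary = "Key Findings:\n" + "\n".join(
    f"• {point}" for point in unique_points[:5]
)
print(insights_summary)
# The duplicate collapses to one entry and longer sentences rank first;
# ties in length come out in arbitrary (set) order.
```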