fikird committed
Commit f424b55
Parent(s): 03649cb

Improve content processing and result formatting

Files changed (2):
  1. app.py +33 -31
  2. search_engine.py +81 -85
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
 import os
 import logging
 import traceback
+import textwrap
 
 # Configure logging
 logging.basicConfig(
@@ -28,42 +29,46 @@ def safe_search(query, max_results):
         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
 
 def format_results(results):
-    """Format search results into a clean markdown output"""
+    """Format search results into a readable markdown string"""
     if 'error' in results:
         return f"❌ Error: {results['error']}"
 
     output = []
 
     # Add insights section
-    if 'insights' in results:
-        insights = results['insights']
-        output.append("# 🎯 Key Insights\n")
-
-        if 'summary' in insights:
-            output.append(insights['summary'])
-            output.append("\n")
-
-        if 'key_points' in insights and len(insights['key_points']) > 5:
-            output.append("\n## πŸ“Œ Additional Points\n")
-            for point in insights['key_points'][5:]:
-                output.append(f"β€’ {point}")
-            output.append("\n")
-
-    # Add sources section
-    if 'insights' in results and 'sources' in results['insights']:
-        output.append("\n# πŸ“š Sources\n")
-        for idx, source in enumerate(results['insights']['sources'], 1):
-            output.append(f"\n## {idx}. {source['title']}\n")
-            if 'url' in source:
-                output.append(f"πŸ”— [View Source]({source['url']})\n")
-            if 'summary' in source:
-                output.append(f"\n{source['summary']}\n")
-
-    # Add follow-up questions
-    if 'follow_up_questions' in results:
-        output.append("\n# ❓ Suggested Questions\n")
+    if 'insights' in results and results['insights']:
+        output.append("# πŸ” Search Results\n")
+        output.append("## πŸ’‘ Key Insights")
+        output.append(results['insights'])
+        output.append("")
+
+    # Add key points section
+    if 'key_points' in results and results['key_points']:
+        output.append("## 🎯 Key Points")
+        for point in results['key_points']:
+            output.append(f"β€’ {point}")
+        output.append("")
+
+    # Add detailed results section
+    if 'results' in results and results['results']:
+        output.append("## πŸ“„ Detailed Results")
+        for i, result in enumerate(results['results'], 1):
+            output.append(f"### {i}. {result['title']}")
+            if 'description' in result and result['description']:
+                output.append(f"*{result['description']}*")
+            if 'summary' in result and result['summary']:
+                # Wrap summary text for better readability
+                wrapped_summary = textwrap.fill(result['summary'], width=80)
+                output.append(f"\n{wrapped_summary}")
+            if 'url' in result:
+                output.append(f"\nπŸ”— [Read more]({result['url']})")
+            output.append("")
+
+    # Add follow-up questions section
+    if 'follow_up_questions' in results and results['follow_up_questions']:
+        output.append("## ❓ Follow-up Questions")
         for question in results['follow_up_questions']:
-            output.append(f"β€’ {question}\n")
+            output.append(f"β€’ {question}")
 
     return "\n".join(output)
@@ -90,10 +95,7 @@ def create_demo():
         )
         search_button = gr.Button("πŸ” Search")
 
-        output = gr.Markdown(
-            label="Search Results",
-            show_label=True
-        )
+        output = gr.Markdown()
 
         search_button.click(
             fn=safe_search,
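
For reference, a minimal sketch of the dict shape the new format_results consumes, with hypothetical placeholder data (the keys mirror what search_engine.py now returns; note 'insights' is a plain summary string after this commit, not a nested dict):

    import textwrap

    # Hypothetical input, shaped like WebSearchEngine.search's new return value
    results = {
        'insights': "Recent work focuses on error-corrected qubits.",
        'key_points': ["Logical qubits now outlive their physical parts."],
        'results': [{
            'title': "Quantum milestone",
            'description': "Overview article",
            'summary': "Researchers report a below-threshold logical qubit, "
                       "a long-sought step toward fault-tolerant machines.",
            'url': "https://example.com/article",
        }],
        'follow_up_questions': ["Can you explain quantum computing in simple terms?"],
    }

    # format_results wraps each result summary to 80 columns before rendering:
    print(textwrap.fill(results['results'][0]['summary'], width=80))
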
search_engine.py CHANGED
@@ -50,52 +50,77 @@ class ContentProcessor:
         # Remove extra whitespace
         text = ' '.join(text.split())
         # Remove common navigation elements
-        nav_elements = ['Skip to content', 'Search', 'Menu', 'Navigation', 'Subscribe', 'Follow']
-        for elem in nav_elements:
-            text = text.replace(elem, '')
+        nav_elements = [
+            "skip to content",
+            "skip to navigation",
+            "search",
+            "menu",
+            "submit",
+            "subscribe",
+            "browse",
+            "explore",
+            "more",
+            "all press releases",
+            "media resources",
+            "media contacts",
+            "investor relations"
+        ]
+        for element in nav_elements:
+            text = text.replace(element.lower(), "")
         return text.strip()
 
-    def extract_key_points(self, content: str, max_points: int = 5) -> List[str]:
-        """Extract key points from content using AI"""
-        try:
-            # Split content into smaller chunks for processing
-            chunks = [content[i:i + 1024] for i in range(0, len(content), 1024)]
-            all_points = []
-
-            for chunk in chunks:
-                summary = self.model_manager.models['summarizer'](
-                    chunk,
-                    max_length=100,
-                    min_length=30,
-                    do_sample=False
-                )[0]['summary_text']
-
-                # Split summary into sentences
-                points = [p.strip() for p in summary.split('.') if p.strip()]
-                all_points.extend(points)
-
-            # Return unique points, prioritizing longer, more informative ones
-            unique_points = list(set(all_points))
-            unique_points.sort(key=len, reverse=True)
-            return unique_points[:max_points]
-
-        except Exception as e:
-            logger.error(f"Error extracting key points: {str(e)}")
-            return []
-
-    def process_content(self, content: str) -> Dict:
+    def extract_main_content(self, soup: BeautifulSoup) -> str:
+        """Extract main content from HTML"""
+        # Remove navigation, header, footer, and sidebar elements
+        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
+            elem.decompose()
+
+        # Remove common non-content elements
+        for elem in soup.find_all(class_=['menu', 'navigation', 'sidebar', 'footer', 'header']):
+            elem.decompose()
+
+        # Extract text from remaining elements
+        content_elements = []
+
+        # Look for main content containers
+        main_content = soup.find(['main', 'article', 'div'], class_=['content', 'main', 'article'])
+        if main_content:
+            content_elements.append(main_content.get_text())
+        else:
+            # If no main content found, look for paragraphs
+            paragraphs = soup.find_all('p')
+            content_elements.extend(p.get_text() for p in paragraphs)
+
+        return ' '.join(content_elements)
+
+    def extract_key_points(self, text: str) -> List[str]:
+        """Extract key points from text"""
+        # Split into sentences
+        sentences = text.split('.')
+        key_points = []
+
+        for sentence in sentences:
+            # Look for sentences with important keywords
+            keywords = ['quantum', 'computer', 'research', 'development', 'breakthrough', 'innovation']
+            if any(keyword in sentence.lower() for keyword in keywords):
+                cleaned = sentence.strip()
+                if cleaned and len(cleaned) > 20:  # Avoid very short sentences
+                    key_points.append(cleaned)
+
+        return key_points[:5]  # Return top 5 key points
+
+    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # Clean the content
+            # Extract main content if HTML is available
+            if soup:
+                content = self.extract_main_content(soup)
+
+            # Clean the text
             cleaned_content = self.clean_text(content)
 
-            if not cleaned_content:
-                return {
-                    'summary': "No meaningful content found",
-                    'content': content,
-                    'key_points': [],
-                    'topics': []
-                }
+            # Extract key points
+            key_points = self.extract_key_points(cleaned_content)
 
             # Generate summary
             summary = self.model_manager.models['summarizer'](
@@ -105,30 +130,16 @@ class ContentProcessor:
                 do_sample=False
             )[0]['summary_text']
 
-            # Extract key points
-            key_points = self.extract_key_points(cleaned_content)
-
-            # Extract main topics using embeddings
-            embeddings = self.model_manager.models['embeddings'].embed_documents(
-                [cleaned_content[:2048]]
-            )
-
-            # You could add topic modeling here if needed
-
             return {
                 'summary': summary,
-                'content': cleaned_content,
                 'key_points': key_points,
-                'topics': []  # Reserved for future topic modeling
+                'content': cleaned_content
             }
-
         except Exception as e:
-            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
-                'content': content,
                 'key_points': [],
-                'topics': []
+                'content': content
             }
 
 class WebSearchEngine:
@@ -204,28 +215,19 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')
 
-            # Extract text content
-            for script in soup(["script", "style"]):
-                script.decompose()
-            text = soup.get_text()
-            lines = (line.strip() for line in text.splitlines())
-            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-            content = ' '.join(chunk for chunk in chunks if chunk)
+            # Process content with HTML context
+            processed = self.processor.process_content(response.text, soup)
 
             # Get metadata
             metadata = self.get_metadata(soup)
 
-            # Process content
-            processed = self.processor.process_content(content)
-
             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
-                'content': processed['content'],
                 'key_points': processed['key_points'],
-                'topics': processed['topics']
+                'content': processed['content']
             }
 
         except Exception as e:
@@ -306,35 +308,29 @@ class WebSearchEngine:
                     if 'key_points' in processed:
                         all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))
 
             if not results:
                 return {'error': 'Failed to process any search results'}
 
-            # Combine and deduplicate key points
-            unique_points = list(set(all_key_points))
-            unique_points.sort(key=len, reverse=True)
+            # Combine all summaries and key points
+            combined_summary = " ".join([r['summary'] for r in results if 'summary' in r])
 
-            # Generate comprehensive insights
-            insights = {
-                'summary': "Key Findings:\n" + "\n".join(f"β€’ {point}" for point in unique_points[:5]),
-                'key_points': unique_points[:10],
-                'sources': [
-                    {
-                        'title': r.get('title', 'Untitled'),
-                        'url': r.get('url', ''),
-                        'summary': r.get('summary', '')
-                    }
-                    for r in results
-                ]
-            }
+            # Generate final insights
+            insights = self.processor.model_manager.models['summarizer'](
+                combined_summary,
+                max_length=200,
+                min_length=100,
+                do_sample=False
+            )[0]['summary_text']
 
             return {
                 'results': results,
                 'insights': insights,
+                'key_points': list(set(all_key_points)),  # Remove duplicates
                 'follow_up_questions': [
-                    f"What are the practical applications of {query}?",
-                    f"How does {query} impact current technology?",
-                    f"What are the future prospects for {query}?"
+                    f"What are the key differences between {query} and related topics?",
+                    f"Can you explain {query} in simple terms?",
+                    f"What are the latest developments in {query}?"
                 ]
             }
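
A quick standalone sketch of the extract_main_content strategy added above, on a toy HTML snippet (the bs4 calls are real API; the HTML and expected output are made up for illustration):

    from bs4 import BeautifulSoup

    html = """
    <html><body>
      <nav>Menu | Search</nav>
      <main class="content"><p>Quantum research saw a breakthrough this year.</p></main>
      <footer>Media contacts</footer>
    </body></html>
    """
    soup = BeautifulSoup(html, 'lxml')  # 'lxml' requires the lxml package

    # Drop chrome elements, as the new method does
    for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
        elem.decompose()

    # Prefer a main content container; both the tag-name and class_ filters
    # accept a list of candidates and match on any of them
    main = soup.find(['main', 'article', 'div'], class_=['content', 'main', 'article'])
    print(main.get_text().strip())
    # -> Quantum research saw a breakthrough this year.
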
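The summarizer is invoked as models['summarizer'](text, ...)[0]['summary_text'], which matches the Hugging Face transformers summarization pipeline interface. A sketch of how such a model could be set up, assuming that interface (the model name is a guess for illustration; ModelManager's actual choice is not shown in this diff):

    from transformers import pipeline

    # Assumed setup; any summarization checkpoint with this pipeline works
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    # Feed it a few hundred words; min_length=100 presumes enough input text
    long_text = ("Researchers report a below-threshold logical qubit, a step "
                 "toward fault-tolerant machines. The result combines better "
                 "hardware with improved decoding. " * 10)
    insights = summarizer(long_text, max_length=200, min_length=100,
                          do_sample=False)[0]['summary_text']
    print(insights)
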