fikird committed
Commit 8e83c5f · Parent: f424b55

Enhance content processing with better summarization and topic extraction

Files changed (2):
  1. app.py +41 -36
  2. search_engine.py +91 -73
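Both files below call `self.model_manager.models['summarizer'](...)` without the commit showing how that pipeline is built. As orientation, here is a minimal sketch of the assumed setup using the Hugging Face transformers summarization pipeline; the `ModelManager` shape and the checkpoint name are illustrative assumptions, not taken from this repository:

```python
# Illustrative stand-in for the model manager the diff relies on; not repo code.
from transformers import pipeline

class ModelManager:
    def __init__(self):
        # 'summarizer' is the key used in app.py/search_engine.py; checkpoint is assumed.
        self.models = {
            'summarizer': pipeline("summarization", model="sshleifer/distilbart-cnn-12-6"),
        }

model_manager = ModelManager()
article = "Quantum processors continue to scale toward practical workloads. " * 20
summary = model_manager.models['summarizer'](
    article[:1024], max_length=150, min_length=50, do_sample=False
)[0]['summary_text']
print(summary)
```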
app.py CHANGED
@@ -4,7 +4,6 @@ import torch
 import os
 import logging
 import traceback
-import textwrap

 # Configure logging
 logging.basicConfig(
@@ -29,48 +28,54 @@ def safe_search(query, max_results):
         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"

 def format_results(results):
-    """Format search results into a readable markdown string"""
-    if 'error' in results:
-        return f"❌ Error: {results['error']}"
-
-    output = []

     # Add insights section
-    if 'insights' in results and results['insights']:
-        output.append("# 🔍 Search Results\n")
-        output.append("## 💡 Key Insights")
-        output.append(results['insights'])
-        output.append("")

-    # Add key points section
-    if 'key_points' in results and results['key_points']:
-        output.append("## 🎯 Key Points")
-        for point in results['key_points']:
-            output.append(f"• {point}")
-        output.append("")

-    # Add detailed results section
-    if 'results' in results and results['results']:
-        output.append("## 📄 Detailed Results")
         for i, result in enumerate(results['results'], 1):
-            output.append(f"### {i}. {result['title']}")
-            if 'description' in result and result['description']:
-                output.append(f"*{result['description']}*")
-            if 'summary' in result and result['summary']:
-                # Wrap summary text for better readability
-                wrapped_summary = textwrap.fill(result['summary'], width=80)
-                output.append(f"\n{wrapped_summary}")
             if 'url' in result:
-                output.append(f"\n🔗 [Read more]({result['url']})")
-            output.append("")
-
-    # Add follow-up questions section
-    if 'follow_up_questions' in results and results['follow_up_questions']:
-        output.append("## ❓ Follow-up Questions")
-        for question in results['follow_up_questions']:
-            output.append(f"• {question}")

-    return "\n".join(output)

 def create_demo():
     """Create the Gradio interface"""
 
         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"

 def format_results(results):
+    """Format search results for display"""
+    if not results or not results.get('results'):
+        return "# ⚠️ No Results\nNo search results were found. Please try a different query."
+
+    formatted = f"# 🔍 Search Results\n\n"

     # Add insights section
+    if 'insights' in results:
+        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"

+    # Add follow-up questions
+    if 'follow_up_questions' in results:
+        formatted += "## ❓ Follow-up Questions\n"
+        for q in results['follow_up_questions']:
+            if q and q.strip():
+                formatted += f"- {q.strip()}\n"
+        formatted += "\n"

+    # Add main results
+    if 'results' in results:
+        formatted += "## 📄 Detailed Results\n\n"
         for i, result in enumerate(results['results'], 1):
+            if not isinstance(result, dict):
+                continue
+
+            formatted += f"### {i}. "
             if 'url' in result:
+                title = result.get('title', 'Untitled')
+                formatted += f"[{title}]({result['url']})\n"
+            if 'summary' in result:
+                formatted += f"\n{result['summary']}\n\n"
+
+    # Add similar chunks if available
+    if 'similar_chunks' in results:
+        formatted += "## 🔍 Related Content\n\n"
+        for i, chunk in enumerate(results['similar_chunks'], 1):
+            if not isinstance(chunk, dict):
+                continue
+
+            formatted += f"### Related {i}\n"
+            if 'metadata' in chunk:
+                meta = chunk['metadata']
+                if 'title' in meta and 'url' in meta:
+                    formatted += f"From [{meta['title']}]({meta['url']})\n"
+            if 'content' in chunk:
+                formatted += f"\n{chunk['content'][:200]}...\n\n"

+    return formatted

 def create_demo():
     """Create the Gradio interface"""
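To see how the rewritten format_results renders, here is a hypothetical call with made-up data shaped like the keys the function reads; it is not real engine output:

```python
# Made-up input; keys mirror those format_results looks for.
sample = {
    'insights': "Quantum hardware is scaling while error rates fall.",
    'follow_up_questions': ["What is quantum error correction?", "  "],
    'results': [{
        'url': "https://example.com/quantum",
        'title': "Quantum computing update",
        'summary': "A short model-generated summary of the page.",
    }],
}
print(format_results(sample))
# Renders markdown with Key Insights, the one non-empty follow-up question,
# and a single linked result followed by its summary.
```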
search_engine.py CHANGED
@@ -47,98 +47,97 @@ class ContentProcessor:

     def clean_text(self, text: str) -> str:
         """Clean and normalize text content"""
-        # Remove extra whitespace
         text = ' '.join(text.split())
         # Remove common navigation elements
-        nav_elements = [
             "skip to content",
-            "skip to navigation",
             "search",
             "menu",
-            "submit",
             "subscribe",
-            "browse",
-            "explore",
-            "more",
-            "all press releases",
-            "media resources",
-            "media contacts",
-            "investor relations"
         ]
-        for element in nav_elements:
-            text = text.replace(element.lower(), "")
-        return text.strip()

-    def extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content from HTML"""
-        # Remove navigation, header, footer, and sidebar elements
-        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
-            elem.decompose()

-        # Remove common non-content elements
-        for elem in soup.find_all(class_=['menu', 'navigation', 'sidebar', 'footer', 'header']):
-            elem.decompose()

-        # Extract text from remaining elements
-        content_elements = []
-
-        # Look for main content containers
-        main_content = soup.find(['main', 'article', 'div'], class_=['content', 'main', 'article'])
-        if main_content:
-            content_elements.append(main_content.get_text())
-        else:
-            # If no main content found, look for paragraphs
-            paragraphs = soup.find_all('p')
-            content_elements.extend(p.get_text() for p in paragraphs)

-        return ' '.join(content_elements)
-
-    def extract_key_points(self, text: str) -> List[str]:
-        """Extract key points from text"""
-        # Split into sentences
-        sentences = text.split('.')
-        key_points = []
-
-        for sentence in sentences:
-            # Look for sentences with important keywords
-            keywords = ['quantum', 'computer', 'research', 'development', 'breakthrough', 'innovation']
-            if any(keyword in sentence.lower() for keyword in keywords):
-                cleaned = sentence.strip()
-                if cleaned and len(cleaned) > 20:  # Avoid very short sentences
-                    key_points.append(cleaned)
-
-        return key_points[:5]  # Return top 5 key points

-    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # Extract main content if HTML is available
-            if soup:
-                content = self.extract_main_content(soup)
-
             # Clean the text
-            cleaned_content = self.clean_text(content)

             # Extract key points
-            key_points = self.extract_key_points(cleaned_content)

-            # Generate summary
             summary = self.model_manager.models['summarizer'](
-                cleaned_content[:1024],
                 max_length=150,
                 min_length=50,
                 do_sample=False
             )[0]['summary_text']

             return {
                 'summary': summary,
                 'key_points': key_points,
-                'content': cleaned_content
             }
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
                 'content': content
             }
@@ -215,18 +214,27 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')

-            # Process content with HTML context
-            processed = self.processor.process_content(response.text, soup)

             # Get metadata
             metadata = self.get_metadata(soup)

             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
                 'key_points': processed['key_points'],
                 'content': processed['content']
             }
@@ -299,38 +307,48 @@ class WebSearchEngine:

         results = []
         all_key_points = []

         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
                     if 'key_points' in processed:
                         all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))
-
         if not results:
             return {'error': 'Failed to process any search results'}

-        # Combine all summaries and key points
-        combined_summary = " ".join([r['summary'] for r in results if 'summary' in r])

-        # Generate final insights
-        insights = self.processor.model_manager.models['summarizer'](
-            combined_summary,
             max_length=200,
             min_length=100,
             do_sample=False
         )[0]['summary_text']

         return {
             'results': results,
-            'insights': insights,
-            'key_points': list(set(all_key_points)),  # Remove duplicates
             'follow_up_questions': [
-                f"What are the key differences between {query} and related topics?",
-                f"Can you explain {query} in simple terms?",
-                f"What are the latest developments in {query}?"
             ]
         }

     def clean_text(self, text: str) -> str:
         """Clean and normalize text content"""
+        # Remove extra whitespace and normalize
+        lines = [line.strip() for line in text.splitlines()]
+        text = ' '.join(line for line in lines if line)
+
+        # Remove redundant spaces
         text = ' '.join(text.split())
+
         # Remove common navigation elements
+        nav_patterns = [
             "skip to content",
             "search",
             "menu",
+            "navigation",
             "subscribe",
+            "sign in",
+            "log in"
         ]
+        for pattern in nav_patterns:
+            text = text.replace(pattern, "")
+
+        return text

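The whitespace handling in the new clean_text boils down to the idiom below; this is a standalone illustration on invented text, not a call into the class:

```python
# Standalone rerun of the clean_text steps on invented input.
text = "  Skip to content\n\n menu  \n Quantum processors reached a new milestone.  "
lines = [line.strip() for line in text.splitlines()]
text = ' '.join(line for line in lines if line)  # drop blank lines, join the rest
text = ' '.join(text.split())                    # collapse repeated spaces
for pattern in ["skip to content", "search", "menu", "navigation",
                "subscribe", "sign in", "log in"]:
    text = text.replace(pattern, "")
print(text)
# 'Skip to content  Quantum processors reached a new milestone.'
# Matching is case-sensitive, so the lowercase "menu" is stripped while the
# capitalised "Skip to content" is left in place.
```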
+    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
+        """Extract key points from text using the summarizer"""
+        try:
+            # Split text into chunks of ~1000 characters
+            chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
+
+            all_points = []
+            for chunk in chunks[:3]:  # Process first 3 chunks only
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+
+                # Split into sentences and add as points
+                sentences = [s.strip() for s in summary.split('.') if s.strip()]
+                all_points.extend(sentences)
+
+            # Return unique points, limited to max_points
+            unique_points = list(dict.fromkeys(all_points))
+            return unique_points[:max_points]
+
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []

+    def process_content(self, content: str) -> Dict:
         """Process content and generate insights"""
         try:
             # Clean the text
+            cleaned_text = self.clean_text(content)

             # Extract key points
+            key_points = self.extract_key_points(cleaned_text)

+            # Generate a concise summary
             summary = self.model_manager.models['summarizer'](
+                cleaned_text[:1024],
                 max_length=150,
                 min_length=50,
                 do_sample=False
             )[0]['summary_text']

+            # Extract potential topics/keywords
+            topics = []
+            common_topics = [
+                "quantum computing", "quantum processors", "quantum bits",
+                "quantum algorithms", "quantum supremacy", "quantum advantage",
+                "error correction", "quantum hardware", "quantum software",
+                "quantum research", "quantum applications"
+            ]
+
+            for topic in common_topics:
+                if topic.lower() in cleaned_text.lower():
+                    topics.append(topic)
+
             return {
                 'summary': summary,
                 'key_points': key_points,
+                'topics': topics[:5],  # Limit to top 5 topics
+                'content': cleaned_text
             }
+
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
+                'topics': [],
                 'content': content
             }
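Taken together, the reworked ContentProcessor follows a clean, summarize, keyword-match pipeline. Below is a rough standalone sketch of that flow on invented text, with the summarizer stubbed out so it runs without loading a model; the helper and values are illustrative only:

```python
# Standalone sketch of the new processing flow; the summarizer is faked so the
# example runs without a model. Keys mirror process_content's return value.
def fake_summarize(text: str) -> str:
    return "Researchers report progress on quantum error correction."

page_text = (
    "menu Researchers report progress on quantum error correction. "
    "New quantum hardware scales to more quantum bits."
)

# 1. Clean (as in clean_text): collapse whitespace, strip known nav words.
cleaned = ' '.join(page_text.split()).replace("menu", "").strip()

# 2. Key points (as in extract_key_points): summarize, split into sentences, dedupe.
sentences = [s.strip() for s in fake_summarize(cleaned).split('.') if s.strip()]
key_points = list(dict.fromkeys(sentences))[:5]

# 3. Topics: simple substring matching against a fixed vocabulary.
common_topics = ["quantum computing", "error correction", "quantum hardware", "quantum bits"]
topics = [t for t in common_topics if t in cleaned.lower()][:5]

result = {'summary': fake_summarize(cleaned), 'key_points': key_points,
          'topics': topics, 'content': cleaned}
print(result['topics'])  # ['error correction', 'quantum hardware', 'quantum bits']
```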
 
 
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')

+            # Extract text content
+            for script in soup(["script", "style"]):
+                script.decompose()
+            text = soup.get_text()
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+            content = ' '.join(chunk for chunk in chunks if chunk)

             # Get metadata
             metadata = self.get_metadata(soup)

+            # Process content
+            processed = self.processor.process_content(content)
+
             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
                 'key_points': processed['key_points'],
+                'topics': processed['topics'],
                 'content': processed['content']
             }
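The new extraction in process_url is the common BeautifulSoup visible-text recipe: drop script and style nodes, then normalize whitespace. A self-contained illustration on a small HTML snippet rather than a fetched page:

```python
from bs4 import BeautifulSoup

html = """
<html><head><style>body {color: red}</style></head>
<body><script>var x = 1;</script>
<h1>Quantum update</h1>
<p>Error   correction
improves.</p></body></html>
"""
soup = BeautifulSoup(html, "lxml")  # requires lxml; use "html.parser" if it is not installed
for node in soup(["script", "style"]):
    node.decompose()                # remove non-visible content
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
content = ' '.join(chunk for line in lines for chunk in line.split() if chunk)
print(content)  # Quantum update Error correction improves.
```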
 
 
         results = []
         all_key_points = []
+        all_topics = set()

         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
+                    # Collect key points and topics
                     if 'key_points' in processed:
                         all_key_points.extend(processed['key_points'])
+                    if 'topics' in processed:
+                        all_topics.update(processed.get('topics', []))
                 time.sleep(random.uniform(0.5, 1.0))
+
         if not results:
             return {'error': 'Failed to process any search results'}

+        # Combine all summaries
+        all_summaries = " ".join([r['summary'] for r in results if 'summary' in r])

+        # Generate a meta-summary of all content
+        meta_summary = self.processor.model_manager.models['summarizer'](
+            all_summaries[:1024],
             max_length=200,
             min_length=100,
             do_sample=False
         )[0]['summary_text']

+        # Get unique key points
+        unique_key_points = list(dict.fromkeys(all_key_points))
+
         return {
             'results': results,
+            'insights': {
+                'summary': meta_summary,
+                'key_points': unique_key_points[:7],  # Top 7 key points
+                'topics': list(all_topics)[:5]  # Top 5 topics
+            },
             'follow_up_questions': [
+                f"What are the recent breakthroughs in {', '.join(list(all_topics)[:2])}?",
+                f"How do these developments impact the future of quantum computing?",
+                f"What are the practical applications of these quantum computing advances?"
             ]
         }
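Two details of this hunk are worth a quick standalone illustration: dict.fromkeys de-duplicates while preserving order, unlike the list(set(...)) it replaces, and 'insights' is now a nested dict rather than a plain summary string (illustrative values below):

```python
# Order-preserving de-duplication, as used for unique_key_points.
points = ["qubits scale up", "error rates drop", "qubits scale up"]
print(list(dict.fromkeys(points)))   # ['qubits scale up', 'error rates drop']
print(list(set(points)))             # same items, but order is not guaranteed

# Shape of the value now returned under 'insights' (made-up values).
insights = {
    'summary': "Meta-summary of all processed pages.",
    'key_points': ["qubits scale up", "error rates drop"],
    'topics': ["quantum hardware", "error correction"],
}
```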
354