fikird committed
Commit 8e83c5f · Parent: f424b55

Enhance content processing with better summarization and topic extraction

Files changed (2):
  1. app.py +41 -36
  2. search_engine.py +91 -73
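Both files below call `self.model_manager.models['summarizer'](...)` without the commit showing how that pipeline is built. As orientation, here is a minimal sketch of the assumed setup using the Hugging Face transformers summarization pipeline; the `ModelManager` shape and the checkpoint name are illustrative assumptions, not taken from this repository:

```python
# Illustrative stand-in for the model manager the diff relies on; not repo code.
from transformers import pipeline

class ModelManager:
    def __init__(self):
        # 'summarizer' is the key used in app.py/search_engine.py; checkpoint is assumed.
        self.models = {
            'summarizer': pipeline("summarization", model="sshleifer/distilbart-cnn-12-6"),
        }

model_manager = ModelManager()
article = "Quantum processors continue to scale toward practical workloads. " * 20
summary = model_manager.models['summarizer'](
    article[:1024], max_length=150, min_length=50, do_sample=False
)[0]['summary_text']
print(summary)
```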
app.py CHANGED
@@ -4,7 +4,6 @@ import torch
 import os
 import logging
 import traceback
-import textwrap

 # Configure logging
 logging.basicConfig(
@@ -29,48 +28,54 @@ def safe_search(query, max_results):
         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"

 def format_results(results):
-    """Format search results into a readable markdown string"""
-    if 'error' in results:
-        return f"❌ Error: {results['error']}"
-
-    output = []

     # Add insights section
-    if 'insights' in results and results['insights']:
-        output.append("# 🔍 Search Results\n")
-        output.append("## 💡 Key Insights")
-        output.append(results['insights'])
-        output.append("")

-    # Add key points section
-    if 'key_points' in results and results['key_points']:
-        output.append("## 🎯 Key Points")
-        for point in results['key_points']:
-            output.append(f"• {point}")
-        output.append("")

-    # Add detailed results section
-    if 'results' in results and results['results']:
-        output.append("## 📄 Detailed Results")
         for i, result in enumerate(results['results'], 1):
-            output.append(f"### {i}. {result['title']}")
-            if 'description' in result and result['description']:
-                output.append(f"*{result['description']}*")
-            if 'summary' in result and result['summary']:
-                # Wrap summary text for better readability
-                wrapped_summary = textwrap.fill(result['summary'], width=80)
-                output.append(f"\n{wrapped_summary}")
             if 'url' in result:
-                output.append(f"\n🔗 [Read more]({result['url']})")
-            output.append("")
-
-    # Add follow-up questions section
-    if 'follow_up_questions' in results and results['follow_up_questions']:
-        output.append("## ❓ Follow-up Questions")
-        for question in results['follow_up_questions']:
-            output.append(f"• {question}")

-    return "\n".join(output)

 def create_demo():
     """Create the Gradio interface"""
 
         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"

 def format_results(results):
+    """Format search results for display"""
+    if not results or not results.get('results'):
+        return "# ⚠️ No Results\nNo search results were found. Please try a different query."
+
+    formatted = f"# 🔍 Search Results\n\n"

     # Add insights section
+    if 'insights' in results:
+        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"

+    # Add follow-up questions
+    if 'follow_up_questions' in results:
+        formatted += "## ❓ Follow-up Questions\n"
+        for q in results['follow_up_questions']:
+            if q and q.strip():
+                formatted += f"- {q.strip()}\n"
+        formatted += "\n"

+    # Add main results
+    if 'results' in results:
+        formatted += "## 📄 Detailed Results\n\n"
         for i, result in enumerate(results['results'], 1):
+            if not isinstance(result, dict):
+                continue
+
+            formatted += f"### {i}. "
             if 'url' in result:
+                title = result.get('title', 'Untitled')
+                formatted += f"[{title}]({result['url']})\n"
+            if 'summary' in result:
+                formatted += f"\n{result['summary']}\n\n"
+
+    # Add similar chunks if available
+    if 'similar_chunks' in results:
+        formatted += "## 🔍 Related Content\n\n"
+        for i, chunk in enumerate(results['similar_chunks'], 1):
+            if not isinstance(chunk, dict):
+                continue
+
+            formatted += f"### Related {i}\n"
+            if 'metadata' in chunk:
+                meta = chunk['metadata']
+                if 'title' in meta and 'url' in meta:
+                    formatted += f"From [{meta['title']}]({meta['url']})\n"
+            if 'content' in chunk:
+                formatted += f"\n{chunk['content'][:200]}...\n\n"

+    return formatted

 def create_demo():
     """Create the Gradio interface"""
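To see how the rewritten format_results renders, here is a hypothetical call with made-up data shaped like the keys the function reads; it is not real engine output:

```python
# Made-up input; keys mirror those format_results looks for.
sample = {
    'insights': "Quantum hardware is scaling while error rates fall.",
    'follow_up_questions': ["What is quantum error correction?", "  "],
    'results': [{
        'url': "https://example.com/quantum",
        'title': "Quantum computing update",
        'summary': "A short model-generated summary of the page.",
    }],
}
print(format_results(sample))
# Renders markdown with Key Insights, the one non-empty follow-up question,
# and a single linked result followed by its summary.
```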
search_engine.py CHANGED
@@ -47,98 +47,97 @@ class ContentProcessor:

     def clean_text(self, text: str) -> str:
         """Clean and normalize text content"""
-        # Remove extra whitespace
         text = ' '.join(text.split())
         # Remove common navigation elements
-        nav_elements = [
             "skip to content",
-            "skip to navigation",
             "search",
             "menu",
-            "submit",
             "subscribe",
-            "browse",
-            "explore",
-            "more",
-            "all press releases",
-            "media resources",
-            "media contacts",
-            "investor relations"
         ]
-        for element in nav_elements:
-            text = text.replace(element.lower(), "")
-        return text.strip()

-    def extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content from HTML"""
-        # Remove navigation, header, footer, and sidebar elements
-        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
-            elem.decompose()

-        # Remove common non-content elements
-        for elem in soup.find_all(class_=['menu', 'navigation', 'sidebar', 'footer', 'header']):
-            elem.decompose()

-        # Extract text from remaining elements
-        content_elements = []
-
-        # Look for main content containers
-        main_content = soup.find(['main', 'article', 'div'], class_=['content', 'main', 'article'])
-        if main_content:
-            content_elements.append(main_content.get_text())
-        else:
-            # If no main content found, look for paragraphs
-            paragraphs = soup.find_all('p')
-            content_elements.extend(p.get_text() for p in paragraphs)

-        return ' '.join(content_elements)
-
-    def extract_key_points(self, text: str) -> List[str]:
-        """Extract key points from text"""
-        # Split into sentences
-        sentences = text.split('.')
-        key_points = []
-
-        for sentence in sentences:
-            # Look for sentences with important keywords
-            keywords = ['quantum', 'computer', 'research', 'development', 'breakthrough', 'innovation']
-            if any(keyword in sentence.lower() for keyword in keywords):
-                cleaned = sentence.strip()
-                if cleaned and len(cleaned) > 20:  # Avoid very short sentences
-                    key_points.append(cleaned)
-
-        return key_points[:5]  # Return top 5 key points

-    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # Extract main content if HTML is available
-            if soup:
-                content = self.extract_main_content(soup)
-
             # Clean the text
-            cleaned_content = self.clean_text(content)

             # Extract key points
-            key_points = self.extract_key_points(cleaned_content)

-            # Generate summary
             summary = self.model_manager.models['summarizer'](
-                cleaned_content[:1024],
                 max_length=150,
                 min_length=50,
                 do_sample=False
             )[0]['summary_text']

             return {
                 'summary': summary,
                 'key_points': key_points,
-                'content': cleaned_content
             }
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
                 'content': content
             }
@@ -215,18 +214,27 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')

-            # Process content with HTML context
-            processed = self.processor.process_content(response.text, soup)

             # Get metadata
             metadata = self.get_metadata(soup)

             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
                 'key_points': processed['key_points'],
                 'content': processed['content']
             }
@@ -299,38 +307,48 @@ class WebSearchEngine:

         results = []
         all_key_points = []

         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
                     if 'key_points' in processed:
                         all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))
-
         if not results:
             return {'error': 'Failed to process any search results'}

-        # Combine all summaries and key points
-        combined_summary = " ".join([r['summary'] for r in results if 'summary' in r])

-        # Generate final insights
-        insights = self.processor.model_manager.models['summarizer'](
-            combined_summary,
             max_length=200,
             min_length=100,
             do_sample=False
         )[0]['summary_text']

         return {
             'results': results,
-            'insights': insights,
-            'key_points': list(set(all_key_points)),  # Remove duplicates
             'follow_up_questions': [
-                f"What are the key differences between {query} and related topics?",
-                f"Can you explain {query} in simple terms?",
-                f"What are the latest developments in {query}?"
             ]
         }

     def clean_text(self, text: str) -> str:
         """Clean and normalize text content"""
+        # Remove extra whitespace and normalize
+        lines = [line.strip() for line in text.splitlines()]
+        text = ' '.join(line for line in lines if line)
+
+        # Remove redundant spaces
         text = ' '.join(text.split())
+
         # Remove common navigation elements
+        nav_patterns = [
             "skip to content",
             "search",
             "menu",
+            "navigation",
             "subscribe",
+            "sign in",
+            "log in"
         ]
+        for pattern in nav_patterns:
+            text = text.replace(pattern, "")
+
+        return text

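The whitespace handling in the new clean_text boils down to the idiom below; this is a standalone illustration on invented text, not a call into the class:

```python
# Standalone rerun of the clean_text steps on invented input.
text = "  Skip to content\n\n menu  \n Quantum processors reached a new milestone.  "
lines = [line.strip() for line in text.splitlines()]
text = ' '.join(line for line in lines if line)  # drop blank lines, join the rest
text = ' '.join(text.split())                    # collapse repeated spaces
for pattern in ["skip to content", "search", "menu", "navigation",
                "subscribe", "sign in", "log in"]:
    text = text.replace(pattern, "")
print(text)
# 'Skip to content  Quantum processors reached a new milestone.'
# Matching is case-sensitive, so the lowercase "menu" is stripped while the
# capitalised "Skip to content" is left in place.
```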
+    def extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
+        """Extract key points from text using the summarizer"""
+        try:
+            # Split text into chunks of ~1000 characters
+            chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]
+
+            all_points = []
+            for chunk in chunks[:3]:  # Process first 3 chunks only
+                summary = self.model_manager.models['summarizer'](
+                    chunk,
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
+                )[0]['summary_text']
+
+                # Split into sentences and add as points
+                sentences = [s.strip() for s in summary.split('.') if s.strip()]
+                all_points.extend(sentences)
+
+            # Return unique points, limited to max_points
+            unique_points = list(dict.fromkeys(all_points))
+            return unique_points[:max_points]
+
+        except Exception as e:
+            logger.error(f"Error extracting key points: {str(e)}")
+            return []

+    def process_content(self, content: str) -> Dict:
         """Process content and generate insights"""
         try:
             # Clean the text
+            cleaned_text = self.clean_text(content)

             # Extract key points
+            key_points = self.extract_key_points(cleaned_text)

+            # Generate a concise summary
             summary = self.model_manager.models['summarizer'](
+                cleaned_text[:1024],
                 max_length=150,
                 min_length=50,
                 do_sample=False
             )[0]['summary_text']

+            # Extract potential topics/keywords
+            topics = []
+            common_topics = [
+                "quantum computing", "quantum processors", "quantum bits",
+                "quantum algorithms", "quantum supremacy", "quantum advantage",
+                "error correction", "quantum hardware", "quantum software",
+                "quantum research", "quantum applications"
+            ]
+
+            for topic in common_topics:
+                if topic.lower() in cleaned_text.lower():
+                    topics.append(topic)
+
             return {
                 'summary': summary,
                 'key_points': key_points,
+                'topics': topics[:5],  # Limit to top 5 topics
+                'content': cleaned_text
             }
+
         except Exception as e:
             return {
                 'summary': f"Error processing content: {str(e)}",
                 'key_points': [],
+                'topics': [],
                 'content': content
             }
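Taken together, the reworked ContentProcessor follows a clean, summarize, keyword-match pipeline. Below is a rough standalone sketch of that flow on invented text, with the summarizer stubbed out so it runs without loading a model; the helper and values are illustrative only:

```python
# Standalone sketch of the new processing flow; the summarizer is faked so the
# example runs without a model. Keys mirror process_content's return value.
def fake_summarize(text: str) -> str:
    return "Researchers report progress on quantum error correction."

page_text = (
    "menu Researchers report progress on quantum error correction. "
    "New quantum hardware scales to more quantum bits."
)

# 1. Clean (as in clean_text): collapse whitespace, strip known nav words.
cleaned = ' '.join(page_text.split()).replace("menu", "").strip()

# 2. Key points (as in extract_key_points): summarize, split into sentences, dedupe.
sentences = [s.strip() for s in fake_summarize(cleaned).split('.') if s.strip()]
key_points = list(dict.fromkeys(sentences))[:5]

# 3. Topics: simple substring matching against a fixed vocabulary.
common_topics = ["quantum computing", "error correction", "quantum hardware", "quantum bits"]
topics = [t for t in common_topics if t in cleaned.lower()][:5]

result = {'summary': fake_summarize(cleaned), 'key_points': key_points,
          'topics': topics, 'content': cleaned}
print(result['topics'])  # ['error correction', 'quantum hardware', 'quantum bits']
```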
 
 
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')

+            # Extract text content
+            for script in soup(["script", "style"]):
+                script.decompose()
+            text = soup.get_text()
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+            content = ' '.join(chunk for chunk in chunks if chunk)

             # Get metadata
             metadata = self.get_metadata(soup)

+            # Process content
+            processed = self.processor.process_content(content)
+
             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
                 'key_points': processed['key_points'],
+                'topics': processed['topics'],
                 'content': processed['content']
             }
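The new extraction in process_url is the common BeautifulSoup visible-text recipe: drop script and style nodes, then normalize whitespace. A self-contained illustration on a small HTML snippet rather than a fetched page:

```python
from bs4 import BeautifulSoup

html = """
<html><head><style>body {color: red}</style></head>
<body><script>var x = 1;</script>
<h1>Quantum update</h1>
<p>Error   correction
improves.</p></body></html>
"""
soup = BeautifulSoup(html, "lxml")  # requires lxml; use "html.parser" if it is not installed
for node in soup(["script", "style"]):
    node.decompose()                # remove non-visible content
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
content = ' '.join(chunk for line in lines for chunk in line.split() if chunk)
print(content)  # Quantum update Error correction improves.
```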
 
 
         results = []
         all_key_points = []
+        all_topics = set()

         for result in search_results:
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
                     results.append(processed)
+                    # Collect key points and topics
                     if 'key_points' in processed:
                         all_key_points.extend(processed['key_points'])
+                    if 'topics' in processed:
+                        all_topics.update(processed.get('topics', []))
                 time.sleep(random.uniform(0.5, 1.0))
+
         if not results:
             return {'error': 'Failed to process any search results'}

+        # Combine all summaries
+        all_summaries = " ".join([r['summary'] for r in results if 'summary' in r])

+        # Generate a meta-summary of all content
+        meta_summary = self.processor.model_manager.models['summarizer'](
+            all_summaries[:1024],
             max_length=200,
             min_length=100,
             do_sample=False
         )[0]['summary_text']

+        # Get unique key points
+        unique_key_points = list(dict.fromkeys(all_key_points))
+
         return {
             'results': results,
+            'insights': {
+                'summary': meta_summary,
+                'key_points': unique_key_points[:7],  # Top 7 key points
+                'topics': list(all_topics)[:5]  # Top 5 topics
+            },
             'follow_up_questions': [
+                f"What are the recent breakthroughs in {', '.join(list(all_topics)[:2])}?",
+                f"How do these developments impact the future of quantum computing?",
+                f"What are the practical applications of these quantum computing advances?"
             ]
         }
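Two details of this hunk are worth a quick standalone illustration: dict.fromkeys de-duplicates while preserving order, unlike the list(set(...)) it replaces, and 'insights' is now a nested dict rather than a plain summary string (illustrative values below):

```python
# Order-preserving de-duplication, as used for unique_key_points.
points = ["qubits scale up", "error rates drop", "qubits scale up"]
print(list(dict.fromkeys(points)))   # ['qubits scale up', 'error rates drop']
print(list(set(points)))             # same items, but order is not guaranteed

# Shape of the value now returned under 'insights' (made-up values).
insights = {
    'summary': "Meta-summary of all processed pages.",
    'key_points': ["qubits scale up", "error rates drop"],
    'topics': ["quantum hardware", "error correction"],
}
```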
354