fikird committed
Commit f424b55
Parent(s): 03649cb

Improve content processing and result formatting

Files changed (2):
  1. app.py +33 -31
  2. search_engine.py +81 -85
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
 import os
 import logging
 import traceback
+import textwrap
 
 # Configure logging
 logging.basicConfig(
@@ -28,42 +29,46 @@ def safe_search(query, max_results):
         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
 
 def format_results(results):
-    """Format search results into a clean markdown output"""
+    """Format search results into a readable markdown string"""
     if 'error' in results:
         return f"❌ Error: {results['error']}"
 
     output = []
 
     # Add insights section
-    if 'insights' in results:
-        insights = results['insights']
-        output.append("# 🎯 Key Insights\n")
-
-        if 'summary' in insights:
-            output.append(insights['summary'])
-            output.append("\n")
-
-        if 'key_points' in insights and len(insights['key_points']) > 5:
-            output.append("\n## πŸ“Œ Additional Points\n")
-            for point in insights['key_points'][5:]:
-                output.append(f"β€’ {point}")
-            output.append("\n")
-
-    # Add sources section
-    if 'insights' in results and 'sources' in results['insights']:
-        output.append("\n# πŸ“š Sources\n")
-        for idx, source in enumerate(results['insights']['sources'], 1):
-            output.append(f"\n## {idx}. {source['title']}\n")
-            if 'url' in source:
-                output.append(f"πŸ”— [View Source]({source['url']})\n")
-            if 'summary' in source:
-                output.append(f"\n{source['summary']}\n")
-
-    # Add follow-up questions
-    if 'follow_up_questions' in results:
-        output.append("\n# ❓ Suggested Questions\n")
+    if 'insights' in results and results['insights']:
+        output.append("# πŸ” Search Results\n")
+        output.append("## πŸ’‘ Key Insights")
+        output.append(results['insights'])
+        output.append("")
+
+    # Add key points section
+    if 'key_points' in results and results['key_points']:
+        output.append("## 🎯 Key Points")
+        for point in results['key_points']:
+            output.append(f"β€’ {point}")
+        output.append("")
+
+    # Add detailed results section
+    if 'results' in results and results['results']:
+        output.append("## πŸ“„ Detailed Results")
+        for i, result in enumerate(results['results'], 1):
+            output.append(f"### {i}. {result['title']}")
+            if 'description' in result and result['description']:
+                output.append(f"*{result['description']}*")
+            if 'summary' in result and result['summary']:
+                # Wrap summary text for better readability
+                wrapped_summary = textwrap.fill(result['summary'], width=80)
+                output.append(f"\n{wrapped_summary}")
+            if 'url' in result:
+                output.append(f"\nπŸ”— [Read more]({result['url']})")
+            output.append("")
+
+    # Add follow-up questions section
+    if 'follow_up_questions' in results and results['follow_up_questions']:
+        output.append("## ❓ Follow-up Questions")
         for question in results['follow_up_questions']:
-            output.append(f"β€’ {question}\n")
+            output.append(f"β€’ {question}")
 
     return "\n".join(output)
@@ -90,10 +95,7 @@ def create_demo():
         )
         search_button = gr.Button("πŸ” Search")
 
-        output = gr.Markdown(
-            label="Search Results",
-            show_label=True
-        )
+        output = gr.Markdown()
 
         search_button.click(
             fn=safe_search,
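
For reference, a minimal sketch of the dict shape the new format_results consumes, with hypothetical placeholder data (the keys mirror what search_engine.py now returns; note 'insights' is a plain summary string after this commit, not a nested dict):

    import textwrap

    # Hypothetical input, shaped like WebSearchEngine.search's new return value
    results = {
        'insights': "Recent work focuses on error-corrected qubits.",
        'key_points': ["Logical qubits now outlive their physical parts."],
        'results': [{
            'title': "Quantum milestone",
            'description': "Overview article",
            'summary': "Researchers report a below-threshold logical qubit, "
                       "a long-sought step toward fault-tolerant machines.",
            'url': "https://example.com/article",
        }],
        'follow_up_questions': ["Can you explain quantum computing in simple terms?"],
    }

    # format_results wraps each result summary to 80 columns before rendering:
    print(textwrap.fill(results['results'][0]['summary'], width=80))
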
search_engine.py CHANGED
@@ -50,52 +50,77 @@ class ContentProcessor:
         # Remove extra whitespace
         text = ' '.join(text.split())
         # Remove common navigation elements
-        nav_elements = ['Skip to content', 'Search', 'Menu', 'Navigation', 'Subscribe', 'Follow']
-        for elem in nav_elements:
-            text = text.replace(elem, '')
+        nav_elements = [
+            "skip to content",
+            "skip to navigation",
+            "search",
+            "menu",
+            "submit",
+            "subscribe",
+            "browse",
+            "explore",
+            "more",
+            "all press releases",
+            "media resources",
+            "media contacts",
+            "investor relations"
+        ]
+        for element in nav_elements:
+            text = text.replace(element.lower(), "")
         return text.strip()
 
-    def extract_key_points(self, content: str, max_points: int = 5) -> List[str]:
-        """Extract key points from content using AI"""
-        try:
-            # Split content into smaller chunks for processing
-            chunks = [content[i:i + 1024] for i in range(0, len(content), 1024)]
-            all_points = []
-
-            for chunk in chunks:
-                summary = self.model_manager.models['summarizer'](
-                    chunk,
-                    max_length=100,
-                    min_length=30,
-                    do_sample=False
-                )[0]['summary_text']
-
-                # Split summary into sentences
-                points = [p.strip() for p in summary.split('.') if p.strip()]
-                all_points.extend(points)
-
-            # Return unique points, prioritizing longer, more informative ones
-            unique_points = list(set(all_points))
-            unique_points.sort(key=len, reverse=True)
-            return unique_points[:max_points]
-
-        except Exception as e:
-            logger.error(f"Error extracting key points: {str(e)}")
-            return []
-
-    def process_content(self, content: str) -> Dict:
+    def extract_main_content(self, soup: BeautifulSoup) -> str:
+        """Extract main content from HTML"""
+        # Remove navigation, header, footer, and sidebar elements
+        for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
+            elem.decompose()
+
+        # Remove common non-content elements
+        for elem in soup.find_all(class_=['menu', 'navigation', 'sidebar', 'footer', 'header']):
+            elem.decompose()
+
+        # Extract text from remaining elements
+        content_elements = []
+
+        # Look for main content containers
+        main_content = soup.find(['main', 'article', 'div'], class_=['content', 'main', 'article'])
+        if main_content:
+            content_elements.append(main_content.get_text())
+        else:
+            # If no main content found, look for paragraphs
+            paragraphs = soup.find_all('p')
+            content_elements.extend(p.get_text() for p in paragraphs)
+
+        return ' '.join(content_elements)
+
+    def extract_key_points(self, text: str) -> List[str]:
+        """Extract key points from text"""
+        # Split into sentences
+        sentences = text.split('.')
+        key_points = []
+
+        for sentence in sentences:
+            # Look for sentences with important keywords
+            keywords = ['quantum', 'computer', 'research', 'development', 'breakthrough', 'innovation']
+            if any(keyword in sentence.lower() for keyword in keywords):
+                cleaned = sentence.strip()
+                if cleaned and len(cleaned) > 20:  # Avoid very short sentences
+                    key_points.append(cleaned)
+
+        return key_points[:5]  # Return top 5 key points
+
+    def process_content(self, content: str, soup: BeautifulSoup = None) -> Dict:
         """Process content and generate insights"""
         try:
-            # Clean the content
+            # Extract main content if HTML is available
+            if soup:
+                content = self.extract_main_content(soup)
+
+            # Clean the text
             cleaned_content = self.clean_text(content)
 
-            if not cleaned_content:
-                return {
-                    'summary': "No meaningful content found",
-                    'content': content,
-                    'key_points': [],
-                    'topics': []
-                }
+            # Extract key points
+            key_points = self.extract_key_points(cleaned_content)
 
             # Generate summary
             summary = self.model_manager.models['summarizer'](
@@ -105,30 +130,16 @@ class ContentProcessor:
                 do_sample=False
             )[0]['summary_text']
 
-            # Extract key points
-            key_points = self.extract_key_points(cleaned_content)
-
-            # Extract main topics using embeddings
-            embeddings = self.model_manager.models['embeddings'].embed_documents(
-                [cleaned_content[:2048]]
-            )
-
-            # You could add topic modeling here if needed
-
             return {
                 'summary': summary,
-                'content': cleaned_content,
                 'key_points': key_points,
-                'topics': []  # Reserved for future topic modeling
+                'content': cleaned_content
             }
-
         except Exception as e:
-            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
-                'content': content,
                 'key_points': [],
-                'topics': []
+                'content': content
             }
 
 class WebSearchEngine:
@@ -204,28 +215,19 @@ class WebSearchEngine:
             response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')
 
-            # Extract text content
-            for script in soup(["script", "style"]):
-                script.decompose()
-            text = soup.get_text()
-            lines = (line.strip() for line in text.splitlines())
-            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-            content = ' '.join(chunk for chunk in chunks if chunk)
+            # Process content with HTML context
+            processed = self.processor.process_content(response.text, soup)
 
             # Get metadata
             metadata = self.get_metadata(soup)
 
-            # Process content
-            processed = self.processor.process_content(content)
-
             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
-                'content': processed['content'],
                 'key_points': processed['key_points'],
-                'topics': processed['topics']
+                'content': processed['content']
             }
 
         except Exception as e:
@@ -306,35 +308,29 @@ class WebSearchEngine:
                     if 'key_points' in processed:
                         all_key_points.extend(processed['key_points'])
                 time.sleep(random.uniform(0.5, 1.0))
 
             if not results:
                 return {'error': 'Failed to process any search results'}
 
-            # Combine and deduplicate key points
-            unique_points = list(set(all_key_points))
-            unique_points.sort(key=len, reverse=True)
+            # Combine all summaries and key points
+            combined_summary = " ".join([r['summary'] for r in results if 'summary' in r])
 
-            # Generate comprehensive insights
-            insights = {
-                'summary': "Key Findings:\n" + "\n".join(f"β€’ {point}" for point in unique_points[:5]),
-                'key_points': unique_points[:10],
-                'sources': [
-                    {
-                        'title': r.get('title', 'Untitled'),
-                        'url': r.get('url', ''),
-                        'summary': r.get('summary', '')
-                    }
-                    for r in results
-                ]
-            }
+            # Generate final insights
+            insights = self.processor.model_manager.models['summarizer'](
+                combined_summary,
+                max_length=200,
+                min_length=100,
+                do_sample=False
+            )[0]['summary_text']
 
             return {
                 'results': results,
                 'insights': insights,
+                'key_points': list(set(all_key_points)),  # Remove duplicates
                 'follow_up_questions': [
-                    f"What are the practical applications of {query}?",
-                    f"How does {query} impact current technology?",
-                    f"What are the future prospects for {query}?"
+                    f"What are the key differences between {query} and related topics?",
+                    f"Can you explain {query} in simple terms?",
+                    f"What are the latest developments in {query}?"
                 ]
             }
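
A quick standalone sketch of the extract_main_content strategy added above, on a toy HTML snippet (the bs4 calls are real API; the HTML and expected output are made up for illustration):

    from bs4 import BeautifulSoup

    html = """
    <html><body>
      <nav>Menu | Search</nav>
      <main class="content"><p>Quantum research saw a breakthrough this year.</p></main>
      <footer>Media contacts</footer>
    </body></html>
    """
    soup = BeautifulSoup(html, 'lxml')  # 'lxml' requires the lxml package

    # Drop chrome elements, as the new method does
    for elem in soup.find_all(['nav', 'header', 'footer', 'aside']):
        elem.decompose()

    # Prefer a main content container; both the tag-name and class_ filters
    # accept a list of candidates and match on any of them
    main = soup.find(['main', 'article', 'div'], class_=['content', 'main', 'article'])
    print(main.get_text().strip())
    # -> Quantum research saw a breakthrough this year.
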
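The summarizer is invoked as models['summarizer'](text, ...)[0]['summary_text'], which matches the Hugging Face transformers summarization pipeline interface. A sketch of how such a model could be set up, assuming that interface (the model name is a guess for illustration; ModelManager's actual choice is not shown in this diff):

    from transformers import pipeline

    # Assumed setup; any summarization checkpoint with this pipeline works
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    # Feed it a few hundred words; min_length=100 presumes enough input text
    long_text = ("Researchers report a below-threshold logical qubit, a step "
                 "toward fault-tolerant machines. The result combines better "
                 "hardware with improved decoding. " * 10)
    insights = summarizer(long_text, max_length=200, min_length=100,
                          do_sample=False)[0]['summary_text']
    print(insights)
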