Spaces: fikird (Build error)

Commit 2f58cc7 · Parent: f2c01c1
Improve content processing with better extraction and formatting

Files changed: search_engine.py (+114 −105)

search_engine.py CHANGED
@@ -51,111 +51,101 @@ class ContentProcessor:
        text = ' '.join(text.split())
        # Remove common navigation elements
        nav_elements = [
        ]
        for element in nav_elements:
-            text = text.replace(element
        return text.strip()

    def extract_main_content(self, soup: BeautifulSoup) -> str:
-        """Extract main content from HTML
-            ('main', {}),
-        ]
-        for tag, attrs in priority_tags:
-            elements = soup.find_all(tag, attrs)
-            if elements:
-                content = " ".join(elem.get_text(strip=True) for elem in elements)
-                if content:
-                    break
-        # If no main content found, try extracting paragraphs
-        if not content:
-            paragraphs = soup.find_all('p')
-            content = " ".join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 100)

-        try:
-            # Split into sentences
-            sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
-            if not sentences:
-                return []
-            max_diff = -1
-            max_idx = -1
-            for i in range(len(sentences)):
-                if i not in selected_indices:
-                    # Calculate average difference from selected sentences
-                    diffs = [sum((embeddings[i][j] - embeddings[k][j])**2
-                                 for j in range(len(embeddings[i])))
-                             for k in selected_indices]
-                    avg_diff = sum(diffs) / len(diffs)
-                    if avg_diff > max_diff:
-                        max_diff = avg_diff
-                        max_idx = i
-            if max_idx != -1:
-                selected_indices.append(max_idx)
-            logger.error(f"Error extracting key points: {str(e)}")
-            return []

-    def process_content(self, content: str,
        """Process content and generate insights"""
        try:
-                content = self.extract_main_content(soup)
-            else:
-                content = self.clean_text(content)

            return {
-                'summary':
            }
        except Exception as e:
            return {
                'summary': f"Error processing content: {str(e)}",
            }

class WebSearchEngine:
@@ -234,8 +224,11 @@ class WebSearchEngine:
            # Get metadata
            metadata = self.get_metadata(soup)

-            # Process content
-            processed = self.processor.process_content(

            return {
                'url': url,
@@ -249,6 +242,35 @@ class WebSearchEngine:
        except Exception as e:
            return {'error': f"Error processing {url}: {str(e)}"}

    def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict]:
        """Search DuckDuckGo and parse HTML results"""
        search_results = []
@@ -314,35 +336,22 @@ class WebSearchEngine:
            return {'error': 'No results found'}

        results = []
-        all_key_points = []
        for result in search_results:
            if 'link' in result:
                processed = self.process_url(result['link'])
                if 'error' not in processed:
                    results.append(processed)
-                    if 'key_points' in processed:
-                        all_key_points.extend(processed['key_points'])
                time.sleep(random.uniform(0.5, 1.0))
        if not results:
            return {'error': 'Failed to process any search results'}

-        # Generate overall insights
-        insights = self.processor.model_manager.models['summarizer'](
-            combined_summary,
-            max_length=200,
-            min_length=100,
-            do_sample=False
-        )[0]['summary_text']

        return {
-            'results': results,
-            'insights': insights,
-            'key_points': all_key_points[:10],  # Top 10 key points
            'follow_up_questions': [
                f"What are the recent breakthroughs in {query}?",
                f"How does {query} impact various industries?",
@@ -51,111 +51,101 @@ class ContentProcessor:
        text = ' '.join(text.split())
        # Remove common navigation elements
        nav_elements = [
+            "Skip to content",
+            "Search",
+            "Menu",
+            "Navigation",
+            "Subscribe",
+            "Browse",
+            "Submit",
+            "More",
+            "About",
+            "Contact",
+            "Privacy Policy",
+            "Terms of Use"
        ]
        for element in nav_elements:
+            text = text.replace(element, "")
        return text.strip()

    def extract_main_content(self, soup: BeautifulSoup) -> str:
+        """Extract main content from HTML"""
+        # Remove navigation, headers, footers
+        for elem in soup.find_all(['nav', 'header', 'footer', 'script', 'style', 'meta', 'link']):
+            elem.decompose()
+
+        # Try to find main content container
+        main_content = None
+        content_tags = ['article', 'main', '[role="main"]', '.content', '#content', '.post', '.entry']

+        for tag in content_tags:
+            main_content = soup.select_one(tag)
+            if main_content:
+                break

+        if not main_content:
+            main_content = soup

+        # Extract text from paragraphs
+        paragraphs = main_content.find_all('p')
+        if paragraphs:
+            return ' '.join(p.get_text(strip=True) for p in paragraphs)

+        # Fallback to all text if no paragraphs found
+        return main_content.get_text(strip=True)
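The selector-priority fallback above can be exercised on its own. A minimal sketch, assuming BeautifulSoup with the built-in html.parser; the function name pick_main_container and the sample markup are illustrative, not part of this commit:

from bs4 import BeautifulSoup

# Same priority list as extract_main_content above
CONTENT_TAGS = ['article', 'main', '[role="main"]', '.content', '#content', '.post', '.entry']

def pick_main_container(html: str):
    soup = BeautifulSoup(html, 'html.parser')
    for selector in CONTENT_TAGS:
        node = soup.select_one(selector)  # first matching container wins
        if node:
            return node
    return soup  # fall back to the whole document

sample = '<html><body><nav>Menu</nav><article><p>Body text.</p></article></body></html>'
print(pick_main_container(sample).get_text(strip=True))  # -> Body text.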
+    def process_content(self, content: str, html_content: str = None) -> Dict:
        """Process content and generate insights"""
        try:
+            # Clean content
+            cleaned_content = self.clean_text(content)

+            # If HTML content is provided, try to extract main content
+            if html_content:
+                soup = BeautifulSoup(html_content, 'lxml')
+                main_content = self.extract_main_content(soup)
+                if main_content:
+                    cleaned_content = self.clean_text(main_content)
+
+            # Generate summary in chunks if content is too long
+            chunks = [cleaned_content[i:i+1024] for i in range(0, len(cleaned_content), 1024)]
+            summaries = []
+
+            for chunk in chunks[:3]:  # Process up to 3 chunks to avoid too long processing
+                try:
+                    summary = self.model_manager.models['summarizer'](
+                        chunk,
+                        max_length=150,
+                        min_length=50,
+                        do_sample=False
+                    )[0]['summary_text']
+                    summaries.append(summary)
+                except Exception as e:
+                    logger.warning(f"Error summarizing chunk: {str(e)}")
+                    continue

+            # Combine summaries
+            final_summary = ' '.join(summaries)
+
+            # Extract key points using bullet points
+            key_points = self.model_manager.models['summarizer'](
+                cleaned_content[:1024],
+                max_length=100,
+                min_length=30,
+                num_beams=4,
+                do_sample=True
+            )[0]['summary_text']

            return {
+                'summary': final_summary,
+                'key_points': key_points,
+                'content': cleaned_content
            }
        except Exception as e:
            return {
                'summary': f"Error processing content: {str(e)}",
+                'key_points': "",
+                'content': content
            }

class WebSearchEngine:
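The chunk-then-join summarization in process_content depends on model_manager.models['summarizer'], which is constructed elsewhere in this file. A minimal sketch of the same pattern, assuming a Hugging Face transformers summarization pipeline as a stand-in for that model; summarize_long_text is a hypothetical helper, not part of this commit:

from transformers import pipeline

summarizer = pipeline('summarization')  # stand-in for model_manager.models['summarizer']

def summarize_long_text(text: str, chunk_size: int = 1024, max_chunks: int = 3) -> str:
    # Fixed-size character chunks, as in process_content
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    summaries = []
    for chunk in chunks[:max_chunks]:
        result = summarizer(chunk, max_length=150, min_length=50, do_sample=False)
        summaries.append(result[0]['summary_text'])
    return ' '.join(summaries)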
@@ -234,8 +224,11 @@ class WebSearchEngine:
            # Get metadata
            metadata = self.get_metadata(soup)

+            # Process content with both text and HTML
+            processed = self.processor.process_content(
+                soup.get_text(),
+                html_content=response.text
+            )

            return {
                'url': url,
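Passing both soup.get_text() and the raw HTML gives process_content a plain-text fallback while letting extract_main_content use the page structure when it can. A small illustration of the difference, with made-up markup:

from bs4 import BeautifulSoup

html = '<html><body><nav>Skip to content</nav><main><p>Actual article text.</p></main></body></html>'
soup = BeautifulSoup(html, 'html.parser')

print(soup.get_text(strip=True))                     # navigation noise included
print(soup.select_one('main').get_text(strip=True))  # structure-aware: 'Actual article text.'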
@@ -249,6 +242,35 @@ class WebSearchEngine:
        except Exception as e:
            return {'error': f"Error processing {url}: {str(e)}"}

+    def format_results(self, results: List[Dict]) -> Dict:
+        """Format search results in a user-friendly way"""
+        formatted_insights = []
+        formatted_results = []
+
+        for result in results:
+            if 'error' not in result:
+                # Format key points
+                if result.get('key_points'):
+                    points = result['key_points'].split('. ')
+                    formatted_points = [f"• {point.strip()}" for point in points if point.strip()]
+                    formatted_insights.extend(formatted_points)
+
+                # Format detailed result
+                formatted_result = {
+                    'title': result['title'],
+                    'url': result['url'],
+                    'summary': result['summary'],
+                }
+                formatted_results.append(formatted_result)
+
+        # Remove duplicates while preserving order
+        formatted_insights = list(dict.fromkeys(formatted_insights))
+
+        return {
+            'insights': '\n'.join(formatted_insights[:10]),  # Top 10 insights
+            'results': formatted_results
+        }
+
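The de-duplication in format_results relies on dict.fromkeys preserving insertion order (guaranteed in Python 3.7+). A quick sketch of the bullet formatting and de-duplication on made-up key points:

key_points = 'AI adoption is rising. Costs are falling. AI adoption is rising'
points = [p.strip() for p in key_points.split('. ') if p.strip()]
bullets = [f"• {p}" for p in points]

# Order-preserving de-duplication, as in format_results
unique_bullets = list(dict.fromkeys(bullets))
print('\n'.join(unique_bullets))
# • AI adoption is rising
# • Costs are falling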
    def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict]:
        """Search DuckDuckGo and parse HTML results"""
        search_results = []
@@ -314,35 +336,22 @@ class WebSearchEngine:
            return {'error': 'No results found'}

        results = []
        for result in search_results:
            if 'link' in result:
                processed = self.process_url(result['link'])
                if 'error' not in processed:
                    results.append(processed)
                time.sleep(random.uniform(0.5, 1.0))
+
        if not results:
            return {'error': 'Failed to process any search results'}

+        # Format results in a user-friendly way
+        formatted = self.format_results(results)

        return {
+            'results': formatted['results'],
+            'insights': formatted['insights'],
            'follow_up_questions': [
                f"What are the recent breakthroughs in {query}?",
                f"How does {query} impact various industries?",