fikird committed
Commit 03649cb · 1 Parent(s): 636f8ae

Improve content processing and result formatting

Files changed (2)
  1. app.py +35 -42
  2. search_engine.py +74 -60
app.py CHANGED
@@ -28,54 +28,44 @@ def safe_search(query, max_results):
         return f"# ❌ Error\nSorry, an error occurred while processing your search:\n```\n{str(e)}\n```"
 
 def format_results(results):
-    """Format search results for display"""
-    if not results or not results.get('results'):
-        return "# ⚠️ No Results\nNo search results were found. Please try a different query."
-
-    formatted = f"# 🔍 Search Results\n\n"
+    """Format search results into a clean markdown output"""
+    if 'error' in results:
+        return f"❌ Error: {results['error']}"
+
+    output = []
 
     # Add insights section
     if 'insights' in results:
-        formatted += f"## 💡 Key Insights\n{results['insights']}\n\n"
+        insights = results['insights']
+        output.append("# 🎯 Key Insights\n")
+
+        if 'summary' in insights:
+            output.append(insights['summary'])
+            output.append("\n")
+
+        if 'key_points' in insights and len(insights['key_points']) > 5:
+            output.append("\n## 📌 Additional Points\n")
+            for point in insights['key_points'][5:]:
+                output.append(f"• {point}")
+            output.append("\n")
+
+    # Add sources section
+    if 'insights' in results and 'sources' in results['insights']:
+        output.append("\n# 📚 Sources\n")
+        for idx, source in enumerate(results['insights']['sources'], 1):
+            output.append(f"\n## {idx}. {source['title']}\n")
+            if 'url' in source:
+                output.append(f"🔗 [View Source]({source['url']})\n")
+            if 'summary' in source:
+                output.append(f"\n{source['summary']}\n")
 
     # Add follow-up questions
     if 'follow_up_questions' in results:
-        formatted += "## ❓ Follow-up Questions\n"
-        for q in results['follow_up_questions']:
-            if q and q.strip():
-                formatted += f"- {q.strip()}\n"
-        formatted += "\n"
-
-    # Add main results
-    if 'results' in results:
-        formatted += "## 📄 Detailed Results\n\n"
-        for i, result in enumerate(results['results'], 1):
-            if not isinstance(result, dict):
-                continue
-
-            formatted += f"### {i}. "
-            if 'url' in result:
-                title = result.get('title', 'Untitled')
-                formatted += f"[{title}]({result['url']})\n"
-            if 'summary' in result:
-                formatted += f"\n{result['summary']}\n\n"
-
-    # Add similar chunks if available
-    if 'similar_chunks' in results:
-        formatted += "## 🔍 Related Content\n\n"
-        for i, chunk in enumerate(results['similar_chunks'], 1):
-            if not isinstance(chunk, dict):
-                continue
-
-            formatted += f"### Related {i}\n"
-            if 'metadata' in chunk:
-                meta = chunk['metadata']
-                if 'title' in meta and 'url' in meta:
-                    formatted += f"From [{meta['title']}]({meta['url']})\n"
-            if 'content' in chunk:
-                formatted += f"\n{chunk['content'][:200]}...\n\n"
+        output.append("\n# ❓ Suggested Questions\n")
+        for question in results['follow_up_questions']:
+            output.append(f"• {question}\n")
 
-    return formatted
+    return "\n".join(output)
 
 def create_demo():
     """Create the Gradio interface"""
@@ -100,7 +90,10 @@ def create_demo():
     )
     search_button = gr.Button("🔍 Search")
 
-    output = gr.Markdown()
+    output = gr.Markdown(
+        label="Search Results",
+        show_label=True
+    )
 
     search_button.click(
         fn=safe_search,
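
For reference, a minimal sketch (not part of the commit) of the result shape the rewritten format_results now consumes. All sample values are illustrative, and it assumes app.py is importable without side effects:

```python
# Hypothetical driver for the new format_results; sample data is made up.
from app import format_results  # assumes app.py imports cleanly

sample_results = {
    'insights': {
        'summary': "Key Findings:\n• Transformers dominate modern NLP.",
        'key_points': [
            "Point one", "Point two", "Point three",
            "Point four", "Point five",
            "Point six lands under 'Additional Points'",  # index 5 onward
        ],
        'sources': [{
            'title': 'Example Source',             # hypothetical source
            'url': 'https://example.com/article',  # placeholder URL
            'summary': 'One-paragraph summary of the page.',
        }],
    },
    'follow_up_questions': ["What are the practical applications of X?"],
}

print(format_results(sample_results))
# Renders "# 🎯 Key Insights", the summary, "## 📌 Additional Points"
# (only points beyond the first five), "# 📚 Sources", then the questions.
```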
search_engine.py CHANGED
@@ -49,77 +49,86 @@ class ContentProcessor:
         """Clean and normalize text content"""
         # Remove extra whitespace
         text = ' '.join(text.split())
-        # Remove redundant headers and navigation text
-        common_headers = ['skip to content', 'search', 'menu', 'navigation', 'subscribe']
-        lines = []
-        for line in text.split('\n'):
-            line = line.strip().lower()
-            if not any(header in line for header in common_headers) and len(line) > 20:
-                lines.append(line)
-        return ' '.join(lines)
+        # Remove common navigation elements
+        nav_elements = ['Skip to content', 'Search', 'Menu', 'Navigation', 'Subscribe', 'Follow']
+        for elem in nav_elements:
+            text = text.replace(elem, '')
+        return text.strip()
 
-    def extract_key_points(self, content: str) -> List[str]:
+    def extract_key_points(self, content: str, max_points: int = 5) -> List[str]:
         """Extract key points from content using AI"""
         try:
-            # Split content into chunks for processing
-            chunks = [content[i:i+1024] for i in range(0, len(content), 1024)]
-            key_points = []
+            # Split content into smaller chunks for processing
+            chunks = [content[i:i + 1024] for i in range(0, len(content), 1024)]
+            all_points = []
 
             for chunk in chunks:
-                # Generate focused summary for each chunk
                 summary = self.model_manager.models['summarizer'](
                     chunk,
-                    max_length=150,
-                    min_length=50,
-                    do_sample=False,
-                    num_beams=4,
-                    length_penalty=2.0,
-                    early_stopping=True
+                    max_length=100,
+                    min_length=30,
+                    do_sample=False
                 )[0]['summary_text']
 
-                key_points.append(summary)
+                # Split summary into sentences
+                points = [p.strip() for p in summary.split('.') if p.strip()]
+                all_points.extend(points)
 
-            return key_points
+            # Return unique points, prioritizing longer, more informative ones
+            unique_points = list(set(all_points))
+            unique_points.sort(key=len, reverse=True)
+            return unique_points[:max_points]
         except Exception as e:
             logger.error(f"Error extracting key points: {str(e)}")
             return []
 
-    def process_content(self, content: str, title: str = "", description: str = "") -> Dict:
+    def process_content(self, content: str) -> Dict:
         """Process content and generate insights"""
         try:
             # Clean the content
             cleaned_content = self.clean_text(content)
 
-            # Combine title and description with content for context
-            if title:
-                cleaned_content = f"{title}. {cleaned_content}"
-            if description:
-                cleaned_content = f"{description}. {cleaned_content}"
+            if not cleaned_content:
+                return {
+                    'summary': "No meaningful content found",
+                    'content': content,
+                    'key_points': [],
+                    'topics': []
+                }
+
+            # Generate summary
+            summary = self.model_manager.models['summarizer'](
+                cleaned_content[:1024],
+                max_length=150,
+                min_length=50,
+                do_sample=False
+            )[0]['summary_text']
 
             # Extract key points
             key_points = self.extract_key_points(cleaned_content)
 
-            # Generate overall summary
-            summary = self.model_manager.models['summarizer'](
-                ' '.join(key_points)[:1024],
-                max_length=200,
-                min_length=100,
-                do_sample=False,
-                num_beams=4,
-                length_penalty=2.0,
-                early_stopping=True
-            )[0]['summary_text']
+            # Extract main topics using embeddings
+            embeddings = self.model_manager.models['embeddings'].embed_documents(
+                [cleaned_content[:2048]]
+            )
+
+            # You could add topic modeling here if needed
 
             return {
                 'summary': summary,
+                'content': cleaned_content,
                 'key_points': key_points,
-                'content': cleaned_content
+                'topics': []  # Reserved for future topic modeling
             }
+
         except Exception as e:
+            logger.error(f"Error processing content: {str(e)}")
             return {
                 'summary': f"Error processing content: {str(e)}",
+                'content': content,
                 'key_points': [],
-                'content': content
+                'topics': []
             }
 
 class WebSearchEngine:
@@ -207,15 +216,16 @@ class WebSearchEngine:
             metadata = self.get_metadata(soup)
 
             # Process content
-            processed = self.processor.process_content(content, metadata['title'], metadata['description'])
+            processed = self.processor.process_content(content)
 
             return {
                 'url': url,
                 'title': metadata['title'],
                 'description': metadata['description'],
                 'summary': processed['summary'],
+                'content': processed['content'],
                 'key_points': processed['key_points'],
-                'content': processed['content']
+                'topics': processed['topics']
             }
 
         except Exception as e:
@@ -292,36 +302,40 @@
             if 'link' in result:
                 processed = self.process_url(result['link'])
                 if 'error' not in processed:
-                    # Add original search snippet
-                    processed['snippet'] = result.get('snippet', '')
                     results.append(processed)
-                    # Collect key points
                     if 'key_points' in processed:
                         all_key_points.extend(processed['key_points'])
             time.sleep(random.uniform(0.5, 1.0))
 
         if not results:
             return {'error': 'Failed to process any search results'}
 
-        # Generate comprehensive insights
-        insights = []
-        if all_key_points:
-            # Group similar points and remove duplicates
-            unique_points = list(set(all_key_points))
-            insights = self.processor.extract_key_points(' '.join(unique_points))
+        # Combine and deduplicate key points
+        unique_points = list(set(all_key_points))
+        unique_points.sort(key=len, reverse=True)
 
-        # Generate relevant follow-up questions
-        follow_up_questions = [
-            f"What are the practical applications of {query}?",
-            f"How does {query} impact industry and research?",
-            f"What challenges and limitations exist in {query}?",
-            f"What future developments are expected in {query}?"
-        ]
+        # Generate comprehensive insights
+        insights = {
+            'summary': "Key Findings:\n" + "\n".join(f"• {point}" for point in unique_points[:5]),
+            'key_points': unique_points[:10],
+            'sources': [
+                {
+                    'title': r.get('title', 'Untitled'),
+                    'url': r.get('url', ''),
+                    'summary': r.get('summary', '')
+                }
+                for r in results
+            ]
+        }
 
         return {
             'results': results,
-            'insights': insights if insights else ["No comprehensive insights available."],
-            'follow_up_questions': follow_up_questions
+            'insights': insights,
+            'follow_up_questions': [
+                f"What are the practical applications of {query}?",
+                f"How does {query} impact current technology?",
+                f"What are the future prospects for {query}?"
+            ]
         }
 
     except Exception as e:
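
A self-contained sketch (not part of the commit) of the deduplicate-and-rank step that both the new extract_key_points and search rely on; the key points below are made up:

```python
# Dedup-and-rank step shared by the new extract_key_points and search:
# drop duplicates, then keep the longest (assumed most informative) first.
all_key_points = [
    "Transformers rely on self-attention",
    "Attention weights every token pair",
    "Transformers rely on self-attention",  # duplicate from a second page
    "Training uses large corpora",
]

unique_points = list(set(all_key_points))  # deduplicate
unique_points.sort(key=len, reverse=True)  # longest first

insights_summary = "Key Findings:\n" + "\n".join(
    f"• {point}" for point in unique_points[:5]
)
print(insights_summary)
# The duplicate collapses to one entry and longer sentences rank first;
# ties in length come out in arbitrary (set) order.
```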