Spaces:

tensor-boy
/

aiws

Build error

App Files Files Community

fikird committed on Dec 2, 2024

Commit

d7b6953

1 Parent(s): a4caf5b

Update duckduckgo-search implementation and fix imports

Browse files

Files changed (2) hide show

README.md +11 -1
search_engine.py +52 -59

README.md CHANGED Viewed

@@ -1,3 +1,13 @@
 # 🔍 Intelligent Web Search Engine
 An advanced AI-powered search engine that provides deep understanding of web content, code analysis, and intelligent insights.
@@ -77,4 +87,4 @@ The search engine uses several AI models:
 ## 🤝 Contributing
-Contributions are welcome! Please feel free to submit a Pull Request.

+---
+license: mit
+title: crawling rag
+sdk: gradio
+emoji: 👁
+colorFrom: gray
+colorTo: purple
+short_description: a RAG that can crawl websites
+sdk_version: 5.7.1
+---
 # 🔍 Intelligent Web Search Engine
 An advanced AI-powered search engine that provides deep understanding of web content, code analysis, and intelligent insights.
 ## 🤝 Contributing
+Contributions are welcome! Please feel free to submit a Pull Request.

search_engine.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from typing import Dict, List, Any
 import requests
 from bs4 import BeautifulSoup
-from duckduckgo_search import ddg
 from transformers import pipeline
-from langchain.embeddings import HuggingFaceEmbeddings
 import time
 import json
 import os
@@ -49,15 +49,12 @@ class ContentProcessor:
             return {
                 'summary': summary,
-                'content_type': 'text',
-                'explanation': summary
             }
         except Exception as e:
-            print(f"Error processing content: {str(e)}")
             return {
-                'summary': content[:200] + "...",
-                'content_type': 'text',
-                'explanation': "Unable to generate detailed analysis."
             }
 class WebSearchEngine:
@@ -68,104 +65,100 @@ class WebSearchEngine:
         self.session = requests.Session()
         self.request_delay = 1.0
         self.last_request_time = 0
     def is_valid_url(self, url: str) -> bool:
         """Check if URL is valid for crawling"""
         try:
             parsed = urlparse(url)
-            return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
         except:
             return False
     def get_metadata(self, soup: BeautifulSoup) -> Dict:
         """Extract metadata from page"""
-        title = soup.title.string if soup.title else ""
         description = ""
         if soup.find("meta", attrs={"name": "description"}):
             description = soup.find("meta", attrs={"name": "description"}).get("content", "")
         return {
-            "title": title,
-            "description": description
         }
     def process_url(self, url: str) -> Dict:
         """Process a single URL"""
         try:
-            # Respect rate limiting
             current_time = time.time()
-            if current_time - self.last_request_time < self.request_delay:
-                time.sleep(self.request_delay - (current_time - self.last_request_time))
             response = self.session.get(url, timeout=10)
             self.last_request_time = time.time()
-            if not response.ok:
-                return None
             soup = BeautifulSoup(response.text, 'lxml')
-            metadata = self.get_metadata(soup)
-            # Extract main content
-            content = ' '.join([p.get_text() for p in soup.find_all('p')])
-            if not content:
-                return None
-            processed_content = self.processor.process_content(content)
-            processed_content['metadata'] = metadata
             return {
                 'url': url,
                 'title': metadata['title'],
-                'snippet': content[:200] + "...",
-                'processed_content': processed_content
             }
         except Exception as e:
-            print(f"Error processing {url}: {str(e)}")
-            return None
     def search(self, query: str, max_results: int = 5) -> Dict:
         """Perform search and process results"""
         try:
             # Search using DuckDuckGo
-            search_results = ddg(query, max_results=max_results)
-            # Process results
-            processed_results = []
             for result in search_results:
-                if self.is_valid_url(result['link']):
                     processed = self.process_url(result['link'])
-                    if processed:
-                        processed_results.append(processed)
-            # Generate insights
-            all_content = ' '.join([r['processed_content']['summary'] for r in processed_results if r])
-            insights = self.processor.process_content(all_content)['summary']
-            # Generate follow-up questions
-            follow_up_questions = [
-                f"What are the key differences between {query} and related topics?",
-                f"How has {query} evolved over time?",
-                f"What are the practical applications of {query}?"
-            ]
             return {
-                'results': processed_results,
-                'insights': insights,
-                'follow_up_questions': follow_up_questions,
-                'similar_queries': []
             }
         except Exception as e:
-            print(f"Error during search: {str(e)}")
-            return {
-                'results': [],
-                'insights': f"Error performing search: {str(e)}",
-                'follow_up_questions': [],
-                'similar_queries': []
-            }
 # Main search function
 def search(query: str, max_results: int = 5) -> Dict:

 from typing import Dict, List, Any
 import requests
 from bs4 import BeautifulSoup
+from duckduckgo_search import DDGS
 from transformers import pipeline
+from langchain_community.embeddings import HuggingFaceEmbeddings
 import time
 import json
 import os
             return {
                 'summary': summary,
+                'content': content
             }
         except Exception as e:
             return {
+                'summary': f"Error processing content: {str(e)}",
+                'content': content
             }
 class WebSearchEngine:
         self.session = requests.Session()
         self.request_delay = 1.0
         self.last_request_time = 0
+        self.ddgs = DDGS()
     def is_valid_url(self, url: str) -> bool:
         """Check if URL is valid for crawling"""
         try:
             parsed = urlparse(url)
+            return bool(parsed.netloc and parsed.scheme)
         except:
             return False
     def get_metadata(self, soup: BeautifulSoup) -> Dict:
         """Extract metadata from page"""
+        title = soup.title.string if soup.title else "No title"
         description = ""
         if soup.find("meta", attrs={"name": "description"}):
             description = soup.find("meta", attrs={"name": "description"}).get("content", "")
         return {
+            'title': title,
+            'description': description
         }
     def process_url(self, url: str) -> Dict:
         """Process a single URL"""
+        if not self.is_valid_url(url):
+            return {'error': f"Invalid URL: {url}"}
         try:
+            # Rate limiting
             current_time = time.time()
+            time_since_last = current_time - self.last_request_time
+            if time_since_last < self.request_delay:
+                time.sleep(self.request_delay - time_since_last)
             response = self.session.get(url, timeout=10)
             self.last_request_time = time.time()
+            if response.status_code != 200:
+                return {'error': f"Failed to fetch URL: {url}, status code: {response.status_code}"}
             soup = BeautifulSoup(response.text, 'lxml')
+            # Extract text content
+            for script in soup(["script", "style"]):
+                script.decompose()
+            text = soup.get_text()
+            lines = (line.strip() for line in text.splitlines())
+            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+            content = ' '.join(chunk for chunk in chunks if chunk)
+            # Get metadata
+            metadata = self.get_metadata(soup)
+            # Process content
+            processed = self.processor.process_content(content)
             return {
                 'url': url,
                 'title': metadata['title'],
+                'description': metadata['description'],
+                'summary': processed['summary'],
+                'content': processed['content']
             }
         except Exception as e:
+            return {'error': f"Error processing {url}: {str(e)}"}
     def search(self, query: str, max_results: int = 5) -> Dict:
         """Perform search and process results"""
         try:
             # Search using DuckDuckGo
+            search_results = list(self.ddgs.text(query, max_results=max_results))
+            results = []
             for result in search_results:
+                if 'link' in result:
                     processed = self.process_url(result['link'])
+                    if 'error' not in processed:
+                        results.append(processed)
+            # Generate insights from results
+            all_content = " ".join([r['summary'] for r in results if 'summary' in r])
             return {
+                'results': results,
+                'insights': all_content[:1000] if all_content else "No insights available.",
+                'follow_up_questions': [
+                    f"What are the key differences between {query} and related topics?",
+                    f"Can you explain {query} in simple terms?",
+                    f"What are the latest developments in {query}?"
+                ]
             }
         except Exception as e:
+            return {'error': f"Search failed: {str(e)}"}
 # Main search function
 def search(query: str, max_results: int = 5) -> Dict: