fikird committed · Commit dd884bf · 1 Parent(s): 5e3672b

Switch to HTML-based DuckDuckGo search for better reliability

Files changed: search_engine.py (+60 -60)

search_engine.py CHANGED
@@ -1,13 +1,12 @@
 from typing import Dict, List, Any
 import requests
 from bs4 import BeautifulSoup
-from duckduckgo_search import DDGS
 from transformers import pipeline
 from langchain_community.embeddings import HuggingFaceEmbeddings
 import time
 import json
 import os
-from urllib.parse import urlparse
+from urllib.parse import urlparse, quote_plus
 import logging
 import random
 
@@ -76,22 +75,14 @@ class WebSearchEngine:
         self.request_delay = 2.0
         self.last_request_time = 0
         self.max_retries = 3
-        self.ddgs = None
         self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
         }
-        self.initialize_search()
-
-    def initialize_search(self):
-        """Initialize DuckDuckGo search with retries"""
-        for _ in range(self.max_retries):
-            try:
-                self.ddgs = DDGS()
-                return
-            except Exception as e:
-                logger.error(f"Error initializing DDGS: {str(e)}")
-                time.sleep(random.uniform(1, 3))
-        raise Exception("Failed to initialize DuckDuckGo search after multiple attempts")
 
     def safe_get(self, url: str, max_retries: int = 3) -> requests.Response:
         """Make a GET request with retries and error handling"""
@@ -172,58 +163,67 @@ class WebSearchEngine:
 
         except Exception as e:
             return {'error': f"Error processing {url}: {str(e)}"}
-
-    def
-        """
+
+    def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict]:
+        """Search DuckDuckGo and parse HTML results"""
+        search_results = []
+
         try:
-            #
-
-
+            # Encode query for URL
+            encoded_query = quote_plus(query)
+
+            # DuckDuckGo HTML search URL
+            search_url = f'https://html.duckduckgo.com/html/?q={encoded_query}'
 
-            #
-
+            # Get search results page
+            response = self.safe_get(search_url)
+            soup = BeautifulSoup(response.text, 'lxml')
 
-            #
-
-            retry_count = 0
+            # Find all result elements
+            results = soup.find_all('div', {'class': 'result'})
 
-
+            for result in results[:max_results]:
                 try:
-                    #
-
-
-
-                        break
-
-                    results_gen = self.ddgs.text(
-                        query,
-                        region=region,
-                        max_results=max_results - len(search_results)
-                    )
-
-                    for result in results_gen:
-                        if len(search_results) >= max_results:
-                            break
-                        if result and isinstance(result, dict) and 'link' in result:
-                            search_results.append(result)
-                            time.sleep(random.uniform(0.2, 0.5))
+                    # Extract link
+                    link_elem = result.find('a', {'class': 'result__a'})
+                    if not link_elem:
+                        continue
 
-
-
-
-                    if search_results:
-                        break
+                    link = link_elem.get('href', '')
+                    if not link or not self.is_valid_url(link):
+                        continue
 
+                    # Extract title
+                    title = link_elem.get_text(strip=True)
+
+                    # Extract snippet
+                    snippet_elem = result.find('a', {'class': 'result__snippet'})
+                    snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
+
+                    search_results.append({
+                        'link': link,
+                        'title': title,
+                        'snippet': snippet
+                    })
+
+                    # Add delay between processing results
+                    time.sleep(random.uniform(0.2, 0.5))
+
                 except Exception as e:
-
-
-
-
-
-
-
-
-
+                    logger.warning(f"Error processing search result: {str(e)}")
+                    continue
+
+            return search_results
+
+        except Exception as e:
+            logger.error(f"Error during DuckDuckGo search: {str(e)}")
+            return []
+
+    def search(self, query: str, max_results: int = 5) -> Dict:
+        """Perform search and process results"""
+        try:
+            # Search using DuckDuckGo HTML
+            search_results = self.search_duckduckgo(query, max_results)
 
         if not search_results:
            return {'error': 'No results found'}