fikird committed · Commit dd884bf · 1 Parent(s): 5e3672b

Switch to HTML-based DuckDuckGo search for better reliability

Files changed: search_engine.py (+60 -60)

search_engine.py CHANGED
@@ -1,13 +1,12 @@
 from typing import Dict, List, Any
 import requests
 from bs4 import BeautifulSoup
-from duckduckgo_search import DDGS
 from transformers import pipeline
 from langchain_community.embeddings import HuggingFaceEmbeddings
 import time
 import json
 import os
-from urllib.parse import urlparse
+from urllib.parse import urlparse, quote_plus
 import logging
 import random
 
@@ -76,22 +75,14 @@ class WebSearchEngine:
         self.request_delay = 2.0
         self.last_request_time = 0
         self.max_retries = 3
-        self.ddgs = None
         self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
         }
-        self.initialize_search()
-
-    def initialize_search(self):
-        """Initialize DuckDuckGo search with retries"""
-        for _ in range(self.max_retries):
-            try:
-                self.ddgs = DDGS()
-                return
-            except Exception as e:
-                logger.error(f"Error initializing DDGS: {str(e)}")
-                time.sleep(random.uniform(1, 3))
-        raise Exception("Failed to initialize DuckDuckGo search after multiple attempts")
 
     def safe_get(self, url: str, max_retries: int = 3) -> requests.Response:
         """Make a GET request with retries and error handling"""
@@ -172,58 +163,67 @@ class WebSearchEngine:
 
         except Exception as e:
             return {'error': f"Error processing {url}: {str(e)}"}
-
-    def
-        """
+
+    def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict]:
+        """Search DuckDuckGo and parse HTML results"""
+        search_results = []
+
         try:
-            #
-
-
+            # Encode query for URL
+            encoded_query = quote_plus(query)
+
+            # DuckDuckGo HTML search URL
+            search_url = f'https://html.duckduckgo.com/html/?q={encoded_query}'
 
-            #
-
+            # Get search results page
+            response = self.safe_get(search_url)
+            soup = BeautifulSoup(response.text, 'lxml')
 
-            #
-
-            retry_count = 0
+            # Find all result elements
+            results = soup.find_all('div', {'class': 'result'})
 
-
+            for result in results[:max_results]:
                 try:
-                    #
-
-
-
-                        break
-
-                    results_gen = self.ddgs.text(
-                        query,
-                        region=region,
-                        max_results=max_results - len(search_results)
-                    )
-
-                    for result in results_gen:
-                        if len(search_results) >= max_results:
-                            break
-                        if result and isinstance(result, dict) and 'link' in result:
-                            search_results.append(result)
-                            time.sleep(random.uniform(0.2, 0.5))
+                    # Extract link
+                    link_elem = result.find('a', {'class': 'result__a'})
+                    if not link_elem:
+                        continue
 
-
-
-
-                    if search_results:
-                        break
+                    link = link_elem.get('href', '')
+                    if not link or not self.is_valid_url(link):
+                        continue
 
+                    # Extract title
+                    title = link_elem.get_text(strip=True)
+
+                    # Extract snippet
+                    snippet_elem = result.find('a', {'class': 'result__snippet'})
+                    snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
+
+                    search_results.append({
+                        'link': link,
+                        'title': title,
+                        'snippet': snippet
+                    })
+
+                    # Add delay between processing results
+                    time.sleep(random.uniform(0.2, 0.5))
+
                 except Exception as e:
-
-
-
-
-
-
-
-
-
+                    logger.warning(f"Error processing search result: {str(e)}")
+                    continue
+
+            return search_results
+
+        except Exception as e:
+            logger.error(f"Error during DuckDuckGo search: {str(e)}")
+            return []
+
+    def search(self, query: str, max_results: int = 5) -> Dict:
+        """Perform search and process results"""
+        try:
+            # Search using DuckDuckGo HTML
+            search_results = self.search_duckduckgo(query, max_results)
 
         if not search_results:
            return {'error': 'No results found'}