fikird committed
Commit dd884bf · 1 Parent(s): 5e3672b

Switch to HTML-based DuckDuckGo search for better reliability
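In short: instead of calling the duckduckgo_search DDGS client, the class now issues a plain GET to DuckDuckGo's HTML endpoint and parses the results with BeautifulSoup. The sketch below condenses the new flow into a standalone function so it can be tried outside the class; the endpoint URL and the result / result__a / result__snippet CSS classes come straight from the diff, while the function name, the minimal headers, the timeout, and the use of requests.get instead of the class's safe_get retry wrapper are illustrative assumptions.

# Standalone sketch of the HTML-based search this commit introduces (not the
# class method itself). Assumes requests, beautifulsoup4 and lxml are installed.
import requests
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
from typing import Dict, List

def ddg_html_search(query: str, max_results: int = 5) -> List[Dict]:
    # html.duckduckgo.com serves a JavaScript-free results page that is easy to parse
    url = f'https://html.duckduckgo.com/html/?q={quote_plus(query)}'
    headers = {'User-Agent': 'Mozilla/5.0'}  # browser-like UA, mirroring the diff's expanded headers
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    results = []
    for result in soup.find_all('div', {'class': 'result'})[:max_results]:
        link_elem = result.find('a', {'class': 'result__a'})
        if not link_elem:
            continue  # skip result blocks without a title link
        snippet_elem = result.find('a', {'class': 'result__snippet'})
        results.append({
            'link': link_elem.get('href', ''),
            'title': link_elem.get_text(strip=True),
            'snippet': snippet_elem.get_text(strip=True) if snippet_elem else '',
        })
    return results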

Files changed (1)
  1. search_engine.py (+60 −60)
search_engine.py CHANGED

@@ -1,13 +1,12 @@
 from typing import Dict, List, Any
 import requests
 from bs4 import BeautifulSoup
-from duckduckgo_search import DDGS
 from transformers import pipeline
 from langchain_community.embeddings import HuggingFaceEmbeddings
 import time
 import json
 import os
-from urllib.parse import urlparse
+from urllib.parse import urlparse, quote_plus
 import logging
 import random

@@ -76,22 +75,14 @@ class WebSearchEngine:
         self.request_delay = 2.0
         self.last_request_time = 0
         self.max_retries = 3
-        self.ddgs = None
         self.headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1'
         }
-        self.initialize_search()
-
-    def initialize_search(self):
-        """Initialize DuckDuckGo search with retries"""
-        for _ in range(self.max_retries):
-            try:
-                self.ddgs = DDGS()
-                return
-            except Exception as e:
-                logger.error(f"Error initializing DDGS: {str(e)}")
-                time.sleep(random.uniform(1, 3))
-        raise Exception("Failed to initialize DuckDuckGo search after multiple attempts")

     def safe_get(self, url: str, max_retries: int = 3) -> requests.Response:
         """Make a GET request with retries and error handling"""
@@ -172,58 +163,67 @@ class WebSearchEngine:

         except Exception as e:
             return {'error': f"Error processing {url}: {str(e)}"}
-
-    def search(self, query: str, max_results: int = 5) -> Dict:
-        """Perform search and process results"""
+
+    def search_duckduckgo(self, query: str, max_results: int = 5) -> List[Dict]:
+        """Search DuckDuckGo and parse HTML results"""
+        search_results = []
+
         try:
-            # Initialize search if needed
-            if self.ddgs is None:
-                self.initialize_search()
+            # Encode query for URL
+            encoded_query = quote_plus(query)
+
+            # DuckDuckGo HTML search URL
+            search_url = f'https://html.duckduckgo.com/html/?q={encoded_query}'

-            # Add delay before search
-            time.sleep(random.uniform(1, 2))
+            # Get search results page
+            response = self.safe_get(search_url)
+            soup = BeautifulSoup(response.text, 'lxml')

-            # Search using DuckDuckGo with retries
-            search_results = []
-            retry_count = 0
+            # Find all result elements
+            results = soup.find_all('div', {'class': 'result'})

-            while retry_count < self.max_retries and len(search_results) < max_results:
+            for result in results[:max_results]:
                 try:
-                    # Try different regions if search fails
-                    regions = ['wt-wt', 'us-en', 'uk-en']
-                    for region in regions:
-                        if len(search_results) >= max_results:
-                            break
-
-                        results_gen = self.ddgs.text(
-                            query,
-                            region=region,
-                            max_results=max_results - len(search_results)
-                        )
-
-                        for result in results_gen:
-                            if len(search_results) >= max_results:
-                                break
-                            if result and isinstance(result, dict) and 'link' in result:
-                                search_results.append(result)
-                            time.sleep(random.uniform(0.2, 0.5))
+                    # Extract link
+                    link_elem = result.find('a', {'class': 'result__a'})
+                    if not link_elem:
+                        continue

-                    if search_results:
-                        break
-
-                    if search_results:
-                        break
+                    link = link_elem.get('href', '')
+                    if not link or not self.is_valid_url(link):
+                        continue

+                    # Extract title
+                    title = link_elem.get_text(strip=True)
+
+                    # Extract snippet
+                    snippet_elem = result.find('a', {'class': 'result__snippet'})
+                    snippet = snippet_elem.get_text(strip=True) if snippet_elem else ""
+
+                    search_results.append({
+                        'link': link,
+                        'title': title,
+                        'snippet': snippet
+                    })
+
+                    # Add delay between processing results
+                    time.sleep(random.uniform(0.2, 0.5))
+
                 except Exception as e:
-                    retry_count += 1
-                    if retry_count >= self.max_retries:
-                        logger.error(f"Search failed after {self.max_retries} attempts: {str(e)}")
-                        if not search_results:
-                            return {'error': f"Search failed after {self.max_retries} attempts: {str(e)}"}
-                        break
-                    logger.warning(f"Search attempt {retry_count} failed: {str(e)}")
-                    time.sleep(random.uniform(2, 5))
-                    self.initialize_search()
+                    logger.warning(f"Error processing search result: {str(e)}")
+                    continue
+
+            return search_results
+
+        except Exception as e:
+            logger.error(f"Error during DuckDuckGo search: {str(e)}")
+            return []
+
+    def search(self, query: str, max_results: int = 5) -> Dict:
+        """Perform search and process results"""
+        try:
+            # Search using DuckDuckGo HTML
+            search_results = self.search_duckduckgo(query, max_results)

             if not search_results:
                 return {'error': 'No results found'}
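Assuming the rest of WebSearchEngine is unchanged by this commit, the new path would be exercised roughly as below; the zero-argument constructor and the example query are assumptions, since the full __init__ is not visible in this diff.

# Hypothetical usage sketch; WebSearchEngine() taking no arguments is an assumption.
from search_engine import WebSearchEngine

engine = WebSearchEngine()

# Raw HTML-scraped results: a list of {'link', 'title', 'snippet'} dicts
for item in engine.search_duckduckgo("open source web search", max_results=3):
    print(item['title'], '->', item['link'])

# Full pipeline: returns a dict, or {'error': ...} when nothing was found
response = engine.search("open source web search")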