fikird committed
Commit 5e3672b · 1 Parent(s): 6c83b94

Improve search reliability with multiple regions and better error handling

Files changed (1):
  1. search_engine.py +71 -22
search_engine.py CHANGED
@@ -13,6 +13,12 @@ import random
 
 logger = logging.getLogger(__name__)
 
+class SearchResult:
+    def __init__(self, title: str, link: str, snippet: str):
+        self.title = title
+        self.link = link
+        self.snippet = snippet
+
 class ModelManager:
     """Manages different AI models for specific tasks"""
 
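The new SearchResult class is a plain holder for one hit; the rest of the commit still passes raw dicts around, so a wrapper like the one below is purely illustrative. The 'link'/'snippet' naming matches the committed code, while newer duckduckgo_search releases call these fields 'href' and 'body'.

    # Hypothetical helper, not part of the commit: adapt a raw result dict
    # to the new SearchResult holder, tolerating either key naming.
    def to_search_result(raw: dict) -> SearchResult:
        return SearchResult(
            title=raw.get('title', ''),
            link=raw.get('link', raw.get('href', '')),
            snippet=raw.get('snippet', raw.get('body', '')),
        )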
@@ -67,10 +73,13 @@ class WebSearchEngine:
     def __init__(self):
         self.processor = ContentProcessor()
         self.session = requests.Session()
-        self.request_delay = 2.0  # Increased delay between requests
+        self.request_delay = 2.0
         self.last_request_time = 0
         self.max_retries = 3
         self.ddgs = None
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
         self.initialize_search()
 
     def initialize_search(self):
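A note on the new headers: the commit passes self.headers explicitly on each request. An equivalent alternative, sketched here, is to set them once on the requests.Session, which then merges them into every request it sends:

    import requests

    session = requests.Session()
    # Headers set on the Session apply to every request it makes,
    # so later .get() calls need no explicit headers= argument.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/91.0.4472.124 Safari/537.36'
    })
    response = session.get('https://example.com', timeout=10)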
@@ -83,6 +92,33 @@
             logger.error(f"Error initializing DDGS: {str(e)}")
             time.sleep(random.uniform(1, 3))
         raise Exception("Failed to initialize DuckDuckGo search after multiple attempts")
+
+    def safe_get(self, url: str, max_retries: int = 3) -> requests.Response:
+        """Make a GET request with retries and error handling"""
+        for i in range(max_retries):
+            try:
+                # Add delay between requests
+                current_time = time.time()
+                time_since_last = current_time - self.last_request_time
+                if time_since_last < self.request_delay:
+                    time.sleep(self.request_delay - time_since_last + random.uniform(0.5, 1.5))
+
+                response = self.session.get(url, headers=self.headers, timeout=10)
+                self.last_request_time = time.time()
+
+                if response.status_code == 200:
+                    return response
+                elif response.status_code == 429:  # Rate limit
+                    wait_time = (i + 1) * 5
+                    time.sleep(wait_time)
+                    continue
+                else:
+                    response.raise_for_status()
+            except Exception as e:
+                if i == max_retries - 1:
+                    raise
+                time.sleep((i + 1) * 2)
+        raise Exception(f"Failed to fetch URL after {max_retries} attempts")
 
     def is_valid_url(self, url: str) -> bool:
         """Check if URL is valid for crawling"""
@@ -109,19 +145,7 @@
             return {'error': f"Invalid URL: {url}"}
 
         try:
-            # Rate limiting with random delay
-            current_time = time.time()
-            time_since_last = current_time - self.last_request_time
-            if time_since_last < self.request_delay:
-                delay = self.request_delay - time_since_last + random.uniform(0.5, 1.5)
-                time.sleep(delay)
-
-            response = self.session.get(url, timeout=10)
-            self.last_request_time = time.time()
-
-            if response.status_code != 200:
-                return {'error': f"Failed to fetch URL: {url}, status code: {response.status_code}"}
-
+            response = self.safe_get(url)
             soup = BeautifulSoup(response.text, 'lxml')
 
             # Extract text content
@@ -163,17 +187,40 @@
         search_results = []
         retry_count = 0
 
-        while retry_count < self.max_retries:
+        while retry_count < self.max_retries and len(search_results) < max_results:
             try:
-                for result in self.ddgs.text(query, max_results=max_results):
-                    search_results.append(result)
-                    # Add small delay between results
-                    time.sleep(random.uniform(0.2, 0.5))
-                break
+                # Try different regions if search fails
+                regions = ['wt-wt', 'us-en', 'uk-en']
+                for region in regions:
+                    if len(search_results) >= max_results:
+                        break
+
+                    results_gen = self.ddgs.text(
+                        query,
+                        region=region,
+                        max_results=max_results - len(search_results)
+                    )
+
+                    for result in results_gen:
+                        if len(search_results) >= max_results:
+                            break
+                        if result and isinstance(result, dict) and 'link' in result:
+                            search_results.append(result)
+                            time.sleep(random.uniform(0.2, 0.5))
+
+                    if search_results:
+                        break
+
+                if search_results:
+                    break
+
             except Exception as e:
                 retry_count += 1
                 if retry_count >= self.max_retries:
-                    return {'error': f"Search failed after {self.max_retries} attempts: {str(e)}"}
+                    logger.error(f"Search failed after {self.max_retries} attempts: {str(e)}")
+                    if not search_results:
+                        return {'error': f"Search failed after {self.max_retries} attempts: {str(e)}"}
+                    break
                 logger.warning(f"Search attempt {retry_count} failed: {str(e)}")
                 time.sleep(random.uniform(2, 5))
                 self.initialize_search()
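The fallback order is 'wt-wt' (worldwide), then 'us-en', then 'uk-en', and the loop stops at the first region that returns anything. Condensed into a standalone sketch (assuming a duckduckgo_search release whose DDGS supports the context-manager form; the commit's retry and delay logic is trimmed here):

    from duckduckgo_search import DDGS

    def region_fallback_search(query: str, max_results: int = 5) -> list:
        results = []
        with DDGS() as ddgs:
            for region in ('wt-wt', 'us-en', 'uk-en'):  # same order as the commit
                for hit in ddgs.text(query, region=region,
                                     max_results=max_results - len(results)):
                    results.append(hit)
                    if len(results) >= max_results:
                        break
                if results:  # first region that yields anything wins
                    break
        return results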
@@ -187,9 +234,11 @@
             processed = self.process_url(result['link'])
             if 'error' not in processed:
                 results.append(processed)
-                # Add delay between processing URLs
                 time.sleep(random.uniform(0.5, 1.0))
 
+        if not results:
+            return {'error': 'Failed to process any search results'}
+
         # Generate insights from results
         all_content = " ".join([r['summary'] for r in results if 'summary' in r])
 
244