fikird committed
Commit d7b6953 · 1 Parent(s): a4caf5b

Update duckduckgo-search implementation and fix imports

Files changed (2)
  1. README.md +11 -1
  2. search_engine.py +52 -59
README.md CHANGED
@@ -1,3 +1,13 @@
+ ---
+ license: mit
+ title: crawling rag
+ sdk: gradio
+ emoji: 👍
+ colorFrom: gray
+ colorTo: purple
+ short_description: A RAG that can crawl websites
+ sdk_version: 5.7.1
+ ---
  # 🔍 Intelligent Web Search Engine
  
  An advanced AI-powered search engine that provides deep understanding of web content, code analysis, and intelligent insights.
@@ -77,4 +87,4 @@ The search engine uses several AI models:
  
  ## 🤝 Contributing
  
- Contributions are welcome! Please feel free to submit a Pull Request.
+ Contributions are welcome! Please feel free to submit a Pull Request.
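For context: the block added above the title is Hugging Face Spaces front matter. Spaces reads this YAML from the very first lines of README.md to configure the Space (SDK, SDK version, license, and card appearance), which is why the heading and description simply shift down by ten lines rather than changing.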
search_engine.py CHANGED
@@ -1,9 +1,9 @@
  from typing import Dict, List, Any
  import requests
  from bs4 import BeautifulSoup
- from duckduckgo_search import ddg
+ from duckduckgo_search import DDGS
  from transformers import pipeline
- from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain_community.embeddings import HuggingFaceEmbeddings
  import time
  import json
  import os
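For context on this import hunk: recent releases of duckduckgo-search dropped the module-level `ddg()` helper in favor of the `DDGS` class, and LangChain moved `HuggingFaceEmbeddings` out of `langchain.embeddings` into the separate `langchain-community` package (so `pip install langchain-community` becomes a dependency). A minimal sketch of the replacement search call, with the query string purely illustrative:

```python
# Minimal sketch of the DDGS API that replaces the removed ddg() helper.
from duckduckgo_search import DDGS

with DDGS() as ddgs:
    # text() yields result dicts with 'title', 'href', and 'body' keys
    results = list(ddgs.text("intelligent web search", max_results=5))

for r in results:
    print(r["title"], "->", r["href"])
```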
@@ -49,15 +49,12 @@ class ContentProcessor:
  
              return {
                  'summary': summary,
-                 'content_type': 'text',
-                 'explanation': summary
+                 'content': content
              }
          except Exception as e:
-             print(f"Error processing content: {str(e)}")
              return {
-                 'summary': content[:200] + "...",
-                 'content_type': 'text',
-                 'explanation': "Unable to generate detailed analysis."
+                 'summary': f"Error processing content: {str(e)}",
+                 'content': content
              }
  
  class WebSearchEngine:
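The rest of `ContentProcessor` sits outside this hunk, so the following is a hypothetical reconstruction of how `process_content` plausibly wraps the edited return statements; the summarization model is an assumption, not something the commit pins down:

```python
# Hypothetical sketch of ContentProcessor around the edited returns;
# the model name below is illustrative, not taken from this commit.
from transformers import pipeline

class ContentProcessor:
    def __init__(self):
        self.summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    def process_content(self, content: str) -> dict:
        try:
            # Truncate input so it fits the model's context window
            result = self.summarizer(content[:1024], max_length=130, min_length=30, do_sample=False)
            summary = result[0]["summary_text"]
            return {'summary': summary, 'content': content}
        except Exception as e:
            # New error shape: report the failure in 'summary' instead of printing
            return {'summary': f"Error processing content: {str(e)}", 'content': content}
```

The change itself is a simplification: the old dict carried redundant `content_type` and `explanation` fields plus a truncated fallback summary, while the new shape always returns `summary` and the raw `content`, folding errors into `summary`.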
@@ -68,104 +65,100 @@ class WebSearchEngine:
          self.session = requests.Session()
          self.request_delay = 1.0
          self.last_request_time = 0
-
+         self.ddgs = DDGS()
+
      def is_valid_url(self, url: str) -> bool:
          """Check if URL is valid for crawling"""
          try:
              parsed = urlparse(url)
-             return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
+             return bool(parsed.netloc and parsed.scheme)
          except:
              return False
  
      def get_metadata(self, soup: BeautifulSoup) -> Dict:
          """Extract metadata from page"""
-         title = soup.title.string if soup.title else ""
+         title = soup.title.string if soup.title else "No title"
          description = ""
          if soup.find("meta", attrs={"name": "description"}):
              description = soup.find("meta", attrs={"name": "description"}).get("content", "")
-
          return {
-             "title": title,
-             "description": description
+             'title': title,
+             'description': description
          }
  
      def process_url(self, url: str) -> Dict:
          """Process a single URL"""
+         if not self.is_valid_url(url):
+             return {'error': f"Invalid URL: {url}"}
+
          try:
-             # Respect rate limiting
+             # Rate limiting
              current_time = time.time()
-             if current_time - self.last_request_time < self.request_delay:
-                 time.sleep(self.request_delay - (current_time - self.last_request_time))
+             time_since_last = current_time - self.last_request_time
+             if time_since_last < self.request_delay:
+                 time.sleep(self.request_delay - time_since_last)
  
              response = self.session.get(url, timeout=10)
              self.last_request_time = time.time()
  
-             if not response.ok:
-                 return None
-
+             if response.status_code != 200:
+                 return {'error': f"Failed to fetch URL: {url}, status code: {response.status_code}"}
+
              soup = BeautifulSoup(response.text, 'lxml')
-             metadata = self.get_metadata(soup)
  
-             # Extract main content
-             content = ' '.join([p.get_text() for p in soup.find_all('p')])
+             # Extract text content
+             for script in soup(["script", "style"]):
+                 script.decompose()
+             text = soup.get_text()
+             lines = (line.strip() for line in text.splitlines())
+             chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+             content = ' '.join(chunk for chunk in chunks if chunk)
  
-             if not content:
-                 return None
+             # Get metadata
+             metadata = self.get_metadata(soup)
  
-             processed_content = self.processor.process_content(content)
-             processed_content['metadata'] = metadata
+             # Process content
+             processed = self.processor.process_content(content)
  
              return {
                  'url': url,
                  'title': metadata['title'],
-                 'snippet': content[:200] + "...",
-                 'processed_content': processed_content
+                 'description': metadata['description'],
+                 'summary': processed['summary'],
+                 'content': processed['content']
              }
  
          except Exception as e:
-             print(f"Error processing {url}: {str(e)}")
-             return None
+             return {'error': f"Error processing {url}: {str(e)}"}
  
      def search(self, query: str, max_results: int = 5) -> Dict:
          """Perform search and process results"""
          try:
              # Search using DuckDuckGo
-             search_results = ddg(query, max_results=max_results)
+             search_results = list(self.ddgs.text(query, max_results=max_results))
  
-             # Process results
-             processed_results = []
+             results = []
              for result in search_results:
-                 if self.is_valid_url(result['link']):
+                 if 'link' in result:
                      processed = self.process_url(result['link'])
-                     if processed:
-                         processed_results.append(processed)
-
-             # Generate insights
-             all_content = ' '.join([r['processed_content']['summary'] for r in processed_results if r])
-             insights = self.processor.process_content(all_content)['summary']
-
-             # Generate follow-up questions
-             follow_up_questions = [
-                 f"What are the key differences between {query} and related topics?",
-                 f"How has {query} evolved over time?",
-                 f"What are the practical applications of {query}?"
-             ]
+                     if 'error' not in processed:
+                         results.append(processed)
+
+             # Generate insights from results
+             all_content = " ".join([r['summary'] for r in results if 'summary' in r])
  
              return {
-                 'results': processed_results,
-                 'insights': insights,
-                 'follow_up_questions': follow_up_questions,
-                 'similar_queries': []
+                 'results': results,
+                 'insights': all_content[:1000] if all_content else "No insights available.",
+                 'follow_up_questions': [
+                     f"What are the key differences between {query} and related topics?",
+                     f"Can you explain {query} in simple terms?",
+                     f"What are the latest developments in {query}?"
+                 ]
              }
  
          except Exception as e:
-             print(f"Error during search: {str(e)}")
-             return {
-                 'results': [],
-                 'insights': f"Error performing search: {str(e)}",
-                 'follow_up_questions': [],
-                 'similar_queries': []
-             }
+             return {'error': f"Search failed: {str(e)}"}
  
  # Main search function
  def search(query: str, max_results: int = 5) -> Dict:
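Two notes on the rewritten methods. First, `DDGS.text()` in current duckduckgo-search releases returns each result URL under the key `'href'`, not `'link'`, so the new `if 'link' in result:` guard may silently skip every result; `result.get('href') or result.get('link')` would be a safer lookup. Second, a quick usage sketch, assuming this file is importable as `search_engine` and that `WebSearchEngine()` takes no required constructor arguments:

```python
# Usage sketch; module path and zero-argument constructor are assumptions.
from search_engine import WebSearchEngine

engine = WebSearchEngine()
report = engine.search("retrieval augmented generation", max_results=3)

# On success, 'results' holds per-URL dicts from process_url()
for item in report.get('results', []):
    print(item['url'], '-', item['title'])

print(report.get('insights', 'no insights'))
```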
 