KenTheNoob committed on
Commit
4779f10
·
1 Parent(s): 50e2c8d

Data collection improvement

Browse files
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,7 +1,7 @@
1
- ---
2
- language:
3
- - en
4
- base_model:
5
- - meta-llama/Llama-3.2-3B-Instruct
6
- ---
7
  This model enhances the performance of the ollama3.2 model in the ros2, nav2, moveit2, and gazebo subdomains through RAG. See the project README.md file for more information on how to setup/run the model and how it works.
 
1
+ ---
2
+ language:
3
+ - en
4
+ base_model:
5
+ - meta-llama/Llama-3.2-3B-Instruct
6
+ ---
7
  This model enhances the performance of the ollama3.2 model in the ros2, nav2, moveit2, and gazebo subdomains through RAG. See the project README.md file for more information on how to setup/run the model and how it works.
project/.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  .gradio
2
- .env
 
 
 
1
  .gradio
2
+ .env
3
+ __pycache__
4
+ /tool/__pycache__
project/ClearML/DataCollectionPipeline.py CHANGED
@@ -12,6 +12,7 @@ import pymongo
12
  import requests
13
  from bs4 import BeautifulSoup
14
  from clearml import PipelineDecorator
 
15
  from dotenv import load_dotenv
16
 
17
  # Setup ClearML
@@ -26,6 +27,7 @@ CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
26
  CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRETKEY")
27
 
28
  # Input into the Data Collection Pipeline is a list of links to domains
 
29
  links = [
30
  "https://www.ros.org/",
31
  "https://docs.nav2.org/",
@@ -36,7 +38,8 @@ links = [
36
  "https://github.com/moveit/moveit2",
37
  "https://github.com/gazebosim/gazebo-classic",
38
  ]
39
- links = ["https://www.ros.org/", "https://github.com/ros2/ros2"]
 
40
 
41
 
42
  # ETL pipeline
@@ -123,33 +126,38 @@ def ETL_Pipeline(links):
123
  mongoCollection = mongoDatabase["Document"]
124
  result = mongoCollection.find_one({"link": link})
125
  if result is None:
126
- # Get all text in the website
127
- r = requests.get(link)
128
- soup = BeautifulSoup(r.content, "html.parser")
129
- soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])
130
- text = soup.get_text()
131
- # Transform the data
132
- # Get rid of repeating \n characters and spaces
133
- text = text.replace("\t", " ")
134
- text = text.replace("\n", " ")
135
- text_len = len(text)
136
- for i in range(text_len):
137
- while i + 1 < text_len and text[i] == " " and text[i + 1] == " ":
138
- text = text[:i] + text[i + 1 :]
139
- text_len -= 1
140
- documents.append({"link": link, "type": "Document", "content": text})
141
- # Also crawl through all subdirectorys in the link(related links)
142
- soup = BeautifulSoup(r.content, "html.parser")
143
- subdirectories = [a.get("href") for a in soup.find_all("a")]
144
- for subdirectory in subdirectories:
145
- if (
146
- subdirectory is not None
147
- and mongoCollection.find_one({"link": link + subdirectory})
148
- is not None
149
- ):
150
- links.append(link + subdirectory)
 
 
 
 
 
151
  # Avoid spamming sites
152
- time.sleep(1)
153
  # Each document has a link, type(github or other), and content(text)
154
  mongoCollection = mongoDatabase["Document"]
155
  mongoCollection.insert_many(documents)
@@ -162,7 +170,7 @@ def ETL_Pipeline(links):
162
  @PipelineDecorator.pipeline(
163
  name="Data Collection Pipeline",
164
  project="RAG LLM",
165
- version="0.2",
166
  )
167
  def main():
168
  return ETL_Pipeline(links)
 
12
  import requests
13
  from bs4 import BeautifulSoup
14
  from clearml import PipelineDecorator
15
+ import urllib.parse
16
  from dotenv import load_dotenv
17
 
18
  # Setup ClearML
 
27
  CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRETKEY")
28
 
29
  # Input into the Data Collection Pipeline is a list of links to domains
30
+ """
31
  links = [
32
  "https://www.ros.org/",
33
  "https://docs.nav2.org/",
 
38
  "https://github.com/moveit/moveit2",
39
  "https://github.com/gazebosim/gazebo-classic",
40
  ]
41
+ """
42
+ links = [ "https://www.ros.org/", "https://github.com/ros2/ros2" ]
43
 
44
 
45
  # ETL pipeline
 
126
  mongoCollection = mongoDatabase["Document"]
127
  result = mongoCollection.find_one({"link": link})
128
  if result is None:
129
+ try:
130
+ # Get all text in the website
131
+ r = requests.get(link)
132
+ soup = BeautifulSoup(r.content, "html.parser")
133
+ soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])
134
+ text = soup.get_text()
135
+ # Transform the data
136
+ # Get rid of repeating \n characters and spaces
137
+ text = text.replace("\t", " ")
138
+ text = text.replace("\n", " ")
139
+ text_len = len(text)
140
+ for i in range(text_len):
141
+ while i + 1 < text_len and text[i] == " " and text[i + 1] == " ":
142
+ text = text[:i] + text[i + 1 :]
143
+ text_len -= 1
144
+ if "404" not in text:
145
+ documents.append({"link": link, "type": "Document", "content": text})
146
+ # Also crawl through all subdirectorys in the link(related links)
147
+ soup = BeautifulSoup(r.content, "html.parser")
148
+ subdirectories = [a.get("href") for a in soup.find_all("a")]
149
+ for subdirectory in subdirectories:
150
+ newLink = urllib.parse.urljoin(link, subdirectory)
151
+ if (
152
+ subdirectory is not None and
153
+ 'http' not in subdirectory and
154
+ mongoCollection.find_one({"link": newLink}) is None
155
+ ):
156
+ links.append(newLink)
157
+ except:
158
+ print("Could not crawl link", link)
159
  # Avoid spamming sites
160
+ time.sleep(0.1)
161
  # Each document has a link, type(github or other), and content(text)
162
  mongoCollection = mongoDatabase["Document"]
163
  mongoCollection.insert_many(documents)
 
170
  @PipelineDecorator.pipeline(
171
  name="Data Collection Pipeline",
172
  project="RAG LLM",
173
+ version="0.4",
174
  )
175
  def main():
176
  return ETL_Pipeline(links)
project/ClearML/FeaturePipeline.py CHANGED
@@ -153,7 +153,7 @@ def storeEmbeddings(embeddings, links, resultTypes, chunks, chunkNums):
153
  @PipelineDecorator.pipeline(
154
  name="Feature Pipeline",
155
  project="RAG LLM",
156
- version="0.2",
157
  )
158
  def main():
159
  links, resultTypes, texts = retreiveDocuments()
 
153
  @PipelineDecorator.pipeline(
154
  name="Feature Pipeline",
155
  project="RAG LLM",
156
+ version="0.3",
157
  )
158
  def main():
159
  links, resultTypes, texts = retreiveDocuments()
project/ClearML/InferencePipeline.py CHANGED
@@ -143,7 +143,7 @@ def reranking(results):
143
  texts = [result.payload["text"] for result in results]
144
  topTexts = ""
145
  for index in topIndexes:
146
- topTexts += texts[index][0]
147
  return topTexts
148
 
149
 
 
143
  texts = [result.payload["text"] for result in results]
144
  topTexts = ""
145
  for index in topIndexes:
146
+ topTexts += texts[index]
147
  return topTexts
148
 
149
 
project/DataCollectionPipeline.ipynb CHANGED
@@ -2,13 +2,17 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stdout",
10
  "output_type": "stream",
11
  "text": [
 
 
 
 
12
  "Visiting link: https://github.com/ros2/ros2\n"
13
  ]
14
  },
@@ -23,22 +27,32 @@
23
  "name": "stdout",
24
  "output_type": "stream",
25
  "text": [
26
- "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/README.md\n",
27
- "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/.gitignore\n",
28
- "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/CODEOWNERS\n",
29
- "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/ros2.repos\n",
30
- "Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/src/.gitkeep\n"
31
  ]
32
  },
33
  {
34
- "data": {
35
- "text/plain": [
36
- "InsertManyResult([ObjectId('675531b926a728d5b045a2e6'), ObjectId('675531b926a728d5b045a2e7'), ObjectId('675531b926a728d5b045a2e8'), ObjectId('675531b926a728d5b045a2e9'), ObjectId('675531b926a728d5b045a2ea')], acknowledged=True)"
37
- ]
38
- },
39
- "execution_count": 1,
40
- "metadata": {},
41
- "output_type": "execute_result"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  }
43
  ],
44
  "source": [
@@ -52,26 +66,24 @@
52
  "import shutil\n",
53
  "import subprocess\n",
54
  "import tempfile\n",
 
55
  "from shared import getMongoClient\n",
56
  "\n",
57
  "# Input into the Data Collection Pipeline is a list of links to domains\n",
58
  "links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n",
59
- "links = ['https://www.ros.org/', 'https://github.com/ros2/ros2']\n",
60
  "\n",
61
  "# Create a mongoDB connection\n",
62
  "mongoHost = getMongoClient()\n",
63
  "mongoDatabase = mongoHost[\"twin\"]\n",
64
- "\n",
65
  "# ETL pipeline\n",
66
  "# Extract data from links and their subdirectories(using crawlers)\n",
67
- "documents = []\n",
68
  "codes = []\n",
69
  "for link in links:\n",
70
  " # Web scraper/crawler for github links\n",
71
  " if \"https://github.com\" in link:\n",
72
  " # Do not revisit a link already in the database\n",
73
  " mongoCollection = mongoDatabase[\"Github\"]\n",
74
- " result = mongoCollection.find_one({\"link\": link})\n",
75
  " if result is None:\n",
76
  " print(\"Visiting link: \", link)\n",
77
  " # Modified GithubCrawler from LLM-Engineer for scraping github\n",
@@ -106,9 +118,9 @@
106
  " path = path.rsplit(\"/\", 1)[0]\n",
107
  " # Push all the subdirectories to mongo\n",
108
  " for subdirectory in tree:\n",
109
- " print(\n",
110
- " f\"Adding subdirectory: https://github.com{path}/{subdirectory}\"\n",
111
- " )\n",
112
  " text = tree[subdirectory]\n",
113
  " # Transform the data\n",
114
  " # Get rid of repeating \\n characters and spaces\n",
@@ -129,6 +141,8 @@
129
  " \"content\": text,\n",
130
  " }\n",
131
  " )\n",
 
 
132
  " else:\n",
133
  " print(\"Already visited: \", link)\n",
134
  " # Web scraper/crawler for other links(Documents)\n",
@@ -137,42 +151,44 @@
137
  " mongoCollection = mongoDatabase[\"Document\"]\n",
138
  " result = mongoCollection.find_one({\"link\": link})\n",
139
  " if result is None:\n",
140
- " # Get all text in the website\n",
141
- " r = requests.get(link)\n",
142
- " soup = BeautifulSoup(r.content, \"html.parser\")\n",
143
- " soup.find_all([\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\"])\n",
144
- " text = soup.get_text()\n",
145
- " # Transform the data\n",
146
- " # Get rid of repeating \\n characters and spaces\n",
147
- " text = text.replace(\"\\t\", \" \")\n",
148
- " text = text.replace(\"\\n\", \" \")\n",
149
- " text_len = len(text)\n",
150
- " for i in range(text_len):\n",
151
- " while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
152
- " text = text[:i] + text[i + 1 :]\n",
153
- " text_len -= 1\n",
154
- " documents.append({\"link\": link, \"type\": \"Document\", \"content\": text})\n",
155
- " # Also crawl through all subdirectorys in the link(related links)\n",
156
- " soup = BeautifulSoup(r.content, \"html.parser\")\n",
157
- " subdirectories = [a.get(\"href\") for a in soup.find_all(\"a\")]\n",
158
- " for subdirectory in subdirectories:\n",
159
- " if (\n",
160
- " subdirectory is not None\n",
161
- " and mongoCollection.find_one({\"link\": link + subdirectory})\n",
162
- " is not None\n",
163
- " ):\n",
164
- " print(\"Adding subdirectory: \", link + subdirectory)\n",
165
- " links.append(link + subdirectory)\n",
 
 
 
 
 
 
 
 
166
  " else:\n",
167
  " print(\"Already visited: \", link)\n",
168
  " # Avoid spamming sites\n",
169
- " time.sleep(1)\n",
170
- "# Each document has a link, type(github or other), and content(text)\n",
171
- "# You can go to Tools/mongoTools to view the inserted documents\n",
172
- "mongoCollection = mongoDatabase[\"Document\"]\n",
173
- "mongoCollection.insert_many(documents)\n",
174
- "mongoCollection = mongoDatabase[\"Github\"]\n",
175
- "mongoCollection.insert_many(codes)"
176
  ]
177
  }
178
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 10,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stdout",
10
  "output_type": "stream",
11
  "text": [
12
+ "Visiting link: https://www.ros.org/\n",
13
+ "Visiting link: https://docs.nav2.org/\n",
14
+ "Visiting link: https://moveit.ai/\n",
15
+ "Visiting link: https://gazebosim.org/home\n",
16
  "Visiting link: https://github.com/ros2/ros2\n"
17
  ]
18
  },
 
27
  "name": "stdout",
28
  "output_type": "stream",
29
  "text": [
30
+ "Visiting link: https://github.com/ros-navigation/navigation2\n"
 
 
 
 
31
  ]
32
  },
33
  {
34
+ "name": "stderr",
35
+ "output_type": "stream",
36
+ "text": [
37
+ "Cloning into 'navigation2'...\n",
38
+ "fatal: early EOF\n"
39
+ ]
40
+ },
41
+ {
42
+ "ename": "KeyboardInterrupt",
43
+ "evalue": "",
44
+ "output_type": "error",
45
+ "traceback": [
46
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
47
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
48
+ "Cell \u001b[0;32mIn[10], line 35\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 34\u001b[0m os\u001b[38;5;241m.\u001b[39mchdir(local_temp)\n\u001b[0;32m---> 35\u001b[0m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclone\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlink\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 36\u001b[0m repo_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(local_temp, os\u001b[38;5;241m.\u001b[39mlistdir(local_temp)[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m 37\u001b[0m tree \u001b[38;5;241m=\u001b[39m {}\n",
49
+ "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:550\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Popen(\u001b[38;5;241m*\u001b[39mpopenargs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m 549\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 550\u001b[0m stdout, stderr \u001b[38;5;241m=\u001b[39m \u001b[43mprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommunicate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 551\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m TimeoutExpired \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 552\u001b[0m process\u001b[38;5;241m.\u001b[39mkill()\n",
50
+ "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1201\u001b[0m, in \u001b[0;36mPopen.communicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m 1199\u001b[0m stderr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 1200\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m-> 1201\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1202\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
51
+ "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1264\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1262\u001b[0m endtime \u001b[38;5;241m=\u001b[39m _time() \u001b[38;5;241m+\u001b[39m timeout\n\u001b[1;32m 1263\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1264\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1265\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1266\u001b[0m \u001b[38;5;66;03m# https://bugs.python.org/issue25942\u001b[39;00m\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;66;03m# The first keyboard interrupt waits briefly for the child to\u001b[39;00m\n\u001b[1;32m 1268\u001b[0m \u001b[38;5;66;03m# exit under the common assumption that it also received the ^C\u001b[39;00m\n\u001b[1;32m 1269\u001b[0m \u001b[38;5;66;03m# generated SIGINT and will exit rapidly.\u001b[39;00m\n\u001b[1;32m 1270\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
52
+ "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2053\u001b[0m, in \u001b[0;36mPopen._wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 2051\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 2052\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m \u001b[38;5;66;03m# Another thread waited.\u001b[39;00m\n\u001b[0;32m-> 2053\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_try_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2054\u001b[0m \u001b[38;5;66;03m# Check the pid and loop as waitpid has been known to\u001b[39;00m\n\u001b[1;32m 2055\u001b[0m \u001b[38;5;66;03m# return 0 even without WNOHANG in odd situations.\u001b[39;00m\n\u001b[1;32m 2056\u001b[0m \u001b[38;5;66;03m# http://bugs.python.org/issue14396.\u001b[39;00m\n\u001b[1;32m 2057\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pid \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid:\n",
53
+ "File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2011\u001b[0m, in \u001b[0;36mPopen._try_wait\u001b[0;34m(self, wait_flags)\u001b[0m\n\u001b[1;32m 2009\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"All callers to this function MUST hold self._waitpid_lock.\"\"\"\u001b[39;00m\n\u001b[1;32m 2010\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2011\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwaitpid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwait_flags\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2012\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mChildProcessError\u001b[39;00m:\n\u001b[1;32m 2013\u001b[0m \u001b[38;5;66;03m# This happens if SIGCLD is set to be ignored or waiting\u001b[39;00m\n\u001b[1;32m 2014\u001b[0m \u001b[38;5;66;03m# for child processes has otherwise been disabled for our\u001b[39;00m\n\u001b[1;32m 2015\u001b[0m \u001b[38;5;66;03m# process. This child is dead, we can't get the status.\u001b[39;00m\n\u001b[1;32m 2016\u001b[0m pid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid\n",
54
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
55
+ ]
56
  }
57
  ],
58
  "source": [
 
66
  "import shutil\n",
67
  "import subprocess\n",
68
  "import tempfile\n",
69
+ "import urllib.parse\n",
70
  "from shared import getMongoClient\n",
71
  "\n",
72
  "# Input into the Data Collection Pipeline is a list of links to domains\n",
73
  "links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n",
 
74
  "\n",
75
  "# Create a mongoDB connection\n",
76
  "mongoHost = getMongoClient()\n",
77
  "mongoDatabase = mongoHost[\"twin\"]\n",
 
78
  "# ETL pipeline\n",
79
  "# Extract data from links and their subdirectories(using crawlers)\n",
 
80
  "codes = []\n",
81
  "for link in links:\n",
82
  " # Web scraper/crawler for github links\n",
83
  " if \"https://github.com\" in link:\n",
84
  " # Do not revisit a link already in the database\n",
85
  " mongoCollection = mongoDatabase[\"Github\"]\n",
86
+ " result = mongoCollection.find_one({\"link\": {\"$regex\" : link}})\n",
87
  " if result is None:\n",
88
  " print(\"Visiting link: \", link)\n",
89
  " # Modified GithubCrawler from LLM-Engineer for scraping github\n",
 
118
  " path = path.rsplit(\"/\", 1)[0]\n",
119
  " # Push all the subdirectories to mongo\n",
120
  " for subdirectory in tree:\n",
121
+ " #print(\n",
122
+ " # f\"Adding subdirectory: https://github.com{path}/{subdirectory}\"\n",
123
+ " #)\n",
124
  " text = tree[subdirectory]\n",
125
  " # Transform the data\n",
126
  " # Get rid of repeating \\n characters and spaces\n",
 
141
  " \"content\": text,\n",
142
  " }\n",
143
  " )\n",
144
+ " mongoCollection.insert_many(codes)\n",
145
+ " codes = []\n",
146
  " else:\n",
147
  " print(\"Already visited: \", link)\n",
148
  " # Web scraper/crawler for other links(Documents)\n",
 
151
  " mongoCollection = mongoDatabase[\"Document\"]\n",
152
  " result = mongoCollection.find_one({\"link\": link})\n",
153
  " if result is None:\n",
154
+ " print(\"Visiting link: \", link)\n",
155
+ " try:\n",
156
+ " # Get all text in the website\n",
157
+ " r = requests.get(link)\n",
158
+ " soup = BeautifulSoup(r.content, \"html.parser\")\n",
159
+ " soup.find_all([\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\"])\n",
160
+ " text = soup.get_text()\n",
161
+ " # Transform the data\n",
162
+ " # Get rid of repeating \\n characters and spaces\n",
163
+ " text = text.replace(\"\\t\", \" \")\n",
164
+ " text = text.replace(\"\\n\", \" \")\n",
165
+ " text_len = len(text)\n",
166
+ " for i in range(text_len):\n",
167
+ " while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
168
+ " text = text[:i] + text[i + 1 :]\n",
169
+ " text_len -= 1\n",
170
+ " if \"404\" not in text:\n",
171
+ " mongoCollection.insert_one({\"link\": link, \"type\": \"Document\", \"content\": text})\n",
172
+ " else:\n",
173
+ " print(\"Page not found: \", link)\n",
174
+ " # Also crawl through all subdirectorys in the link(related links)\n",
175
+ " soup = BeautifulSoup(r.content, \"html.parser\")\n",
176
+ " subdirectories = [a.get(\"href\") for a in soup.find_all(\"a\")]\n",
177
+ " for subdirectory in subdirectories:\n",
178
+ " newLink = urllib.parse.urljoin(link, subdirectory)\n",
179
+ " if (\n",
180
+ " subdirectory is not None and\n",
181
+ " 'http' not in subdirectory and\n",
182
+ " mongoCollection.find_one({\"link\": newLink}) is None\n",
183
+ " ):\n",
184
+ " #print(\"Adding subdirectory: \", link + subdirectory)\n",
185
+ " links.append(newLink)\n",
186
+ " except:\n",
187
+ " print(\"Could not crawl link\", link)\n",
188
  " else:\n",
189
  " print(\"Already visited: \", link)\n",
190
  " # Avoid spamming sites\n",
191
+ " time.sleep(0.1)"
 
 
 
 
 
 
192
  ]
193
  }
194
  ],
project/FeaturePipeline.ipynb CHANGED
@@ -2,15 +2,33 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
- "name": "stderr",
10
- "output_type": "stream",
11
- "text": [
12
- "/workspaces/RAG_LLM/project/shared.py:57: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaEmbeddings``.\n",
13
- " return OllamaEmbeddings(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  ]
15
  }
16
  ],
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 15,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
+ "ename": "KeyboardInterrupt",
10
+ "evalue": "",
11
+ "output_type": "error",
12
+ "traceback": [
13
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
14
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
15
+ "Cell \u001b[0;32mIn[15], line 61\u001b[0m\n\u001b[1;32m 58\u001b[0m chunkNum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m chunks:\n\u001b[1;32m 60\u001b[0m \u001b[38;5;66;03m# Create embeddings for each chunk, of length 2048 using the embedding model\u001b[39;00m\n\u001b[0;32m---> 61\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[43membeddingsModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membed_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;66;03m# Store the embedding along with some metadata into the Qdrant vector database\u001b[39;00m\n\u001b[1;32m 63\u001b[0m qClient\u001b[38;5;241m.\u001b[39mupsert(collection_name\u001b[38;5;241m=\u001b[39mresultType, wait\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, points\u001b[38;5;241m=\u001b[39m[PointStruct(\u001b[38;5;28mid\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mid\u001b[39m, vector\u001b[38;5;241m=\u001b[39membedding, payload\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlink\u001b[39m\u001b[38;5;124m\"\u001b[39m: link, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: resultType, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchunk\u001b[39m\u001b[38;5;124m\"\u001b[39m: chunkNum, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m: chunk})])\n",
16
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/langchain_community/embeddings/ollama.py:227\u001b[0m, in \u001b[0;36mOllamaEmbeddings.embed_query\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Embed a query using a Ollama deployed embedding model.\u001b[39;00m\n\u001b[1;32m 219\u001b[0m \n\u001b[1;32m 220\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[38;5;124;03m Embeddings for the text.\u001b[39;00m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 226\u001b[0m instruction_pair \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mquery_instruction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mtext\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 227\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_embed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43minstruction_pair\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m embedding\n",
17
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/langchain_community/embeddings/ollama.py:202\u001b[0m, in \u001b[0;36mOllamaEmbeddings._embed\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 201\u001b[0m iter_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28minput\u001b[39m\n\u001b[0;32m--> 202\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_emb_response\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m prompt \u001b[38;5;129;01min\u001b[39;00m iter_]\n",
18
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/langchain_community/embeddings/ollama.py:167\u001b[0m, in \u001b[0;36mOllamaEmbeddings._process_emb_response\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 161\u001b[0m headers \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 162\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mContent-Type\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mapplication/json\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 163\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mheaders \u001b[38;5;129;01mor\u001b[39;00m {}),\n\u001b[1;32m 164\u001b[0m }\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mrequests\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpost\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 168\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbase_url\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m/api/embeddings\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 169\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 170\u001b[0m \u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprompt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_default_params\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 171\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mRequestException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError raised by inference endpoint: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
19
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/api.py:115\u001b[0m, in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpost\u001b[39m(url, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, json\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 104\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Sends a POST request.\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \n\u001b[1;32m 106\u001b[0m \u001b[38;5;124;03m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124;03m :rtype: requests.Response\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpost\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
20
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/api.py:59\u001b[0m, in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;66;03m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;66;03m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m sessions\u001b[38;5;241m.\u001b[39mSession() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[0;32m---> 59\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
21
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 584\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 585\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[1;32m 586\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 587\u001b[0m }\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
22
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 700\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[1;32m 702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m 706\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n",
23
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/adapters.py:667\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 664\u001b[0m timeout \u001b[38;5;241m=\u001b[39m TimeoutSauce(connect\u001b[38;5;241m=\u001b[39mtimeout, read\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 666\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 667\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 668\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 669\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 670\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 671\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 672\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 673\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 674\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 675\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 676\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 677\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 678\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 679\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 681\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 682\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(err, request\u001b[38;5;241m=\u001b[39mrequest)\n",
24
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/urllib3/connectionpool.py:789\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[0m\n\u001b[1;32m 786\u001b[0m response_conn \u001b[38;5;241m=\u001b[39m conn \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m release_conn \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 788\u001b[0m \u001b[38;5;66;03m# Make the request on the HTTPConnection object\u001b[39;00m\n\u001b[0;32m--> 789\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 790\u001b[0m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 791\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 792\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 793\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 794\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 795\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 796\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 797\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 798\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresponse_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 799\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 800\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 801\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 802\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 804\u001b[0m \u001b[38;5;66;03m# Everything went great!\u001b[39;00m\n\u001b[1;32m 805\u001b[0m clean_exit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
25
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/urllib3/connectionpool.py:536\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;66;03m# Receive the response from the server\u001b[39;00m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 536\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (BaseSSLError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_timeout(err\u001b[38;5;241m=\u001b[39me, url\u001b[38;5;241m=\u001b[39murl, timeout_value\u001b[38;5;241m=\u001b[39mread_timeout)\n",
26
+ "File \u001b[0;32m/usr/local/lib/python3.12/site-packages/urllib3/connection.py:507\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mresponse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HTTPResponse\n\u001b[1;32m 506\u001b[0m \u001b[38;5;66;03m# Get the response from http.client.HTTPConnection\u001b[39;00m\n\u001b[0;32m--> 507\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 509\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 510\u001b[0m assert_header_parsing(httplib_response\u001b[38;5;241m.\u001b[39mmsg)\n",
27
+ "File \u001b[0;32m/usr/local/lib/python3.12/http/client.py:1428\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1426\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1427\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1428\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbegin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1429\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[1;32m 1430\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
28
+ "File \u001b[0;32m/usr/local/lib/python3.12/http/client.py:331\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 331\u001b[0m version, status, reason \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_read_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m!=\u001b[39m CONTINUE:\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
29
+ "File \u001b[0;32m/usr/local/lib/python3.12/http/client.py:292\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 292\u001b[0m line \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_MAXLINE\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miso-8859-1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) \u001b[38;5;241m>\u001b[39m _MAXLINE:\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus line\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
30
+ "File \u001b[0;32m/usr/local/lib/python3.12/socket.py:720\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 720\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 721\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 722\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
31
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
32
  ]
33
  }
34
  ],
project/Tools/QdrantTools.ipynb CHANGED
@@ -2,57 +2,17 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stdout",
10
  "output_type": "stream",
11
  "text": [
12
- "id=0 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 0, 'text': 'ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\\'s all open source. What is ROS? ROS Videos \" Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and'} vector=None shard_key=None order_value=None\n",
13
- "id=1 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 1, 'text': '24.04 (Noble) and Windows 10, though other systems are supported to varying degrees. Learn More Humble Hawksbill ROS 2 Humble Hawksbill is a slighly older LTS release of ROS 2 targeted at Ubuntu 22.04 (Jammy) and Windows 10. Other systems are supported including tier 3 support for 20.04 for those transitioning from ROS 1. Learn More Support There are several mechanisms in place to support the ROS community, each with its own purpose. Documentation Documentation and tutorials for ROS 2 Stack'} vector=None shard_key=None order_value=None\n",
14
- "id=2 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 2, 'text': 'for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The'} vector=None shard_key=None order_value=None\n",
15
- "id=3 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 3, 'text': 'Katherine Scott The videos from ROSCon 2024 in Odense are now available on the ROSCon Website (see the program), this Vimeo showcase, and in the ROS documentation. The ROSCon website also includes the slides from all the talks at ROSCon. I have also included a list of all the videos below. I want to thank AMD for being our 2024 ROSCon video sponsor, their generous support makes the ROSCon live stream and videos possible. READ MORE Recent ROS Discourse Posts ROS News of the Week 11/22/2024 - ROS'} vector=None shard_key=None order_value=None\n",
16
- "id=4 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 4, 'text': '11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | 2021 Open Robotics'} vector=None shard_key=None order_value=None\n",
17
- "Number of document chunks: 5\n",
18
  "\n",
19
- "Sample document chunk(metadata not the vector): \n",
20
- "id=0 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 0, 'text': 'ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\\'s all open source. What is ROS? ROS Videos \" Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and'} vector=None shard_key=None order_value=None \n",
21
- "\n",
22
- "id=0 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 0, 'text': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\"} vector=None shard_key=None order_value=None\n",
23
- "id=1 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 1, 'text': \"Onceyou'veinstalledROSstartbylearningsome[basicconcepts](https://docs.ros.org/en/rolling/Concepts/Basic.html)andtakealookatour[beginnertutorials](https://docs.ros.org/en/rolling/Tutorials/Beginner-CLI-Tools.html). #JointheROSCommunity ##CommunityResources *[ROSDiscussionForum](https://discourse.ros.org/) *[ROSDiscordServer](https://discord.com/servers/open-robotics-1077825543698927656) *[RoboticsStackExchange](https://robotics.stackexchange.com/)(preferredROSsupportforum).\"} vector=None shard_key=None order_value=None\n",
24
- "id=2 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 2, 'text': '*[OfficialROSVideos](https://vimeo.com/osrfoundation) *[ROSCon](https://roscon.ros.org),ouryearlydeveloperconference. *CiteROS2inacademicworkusing[DOI:10.1126/scirobotics.abm6074](https://www.science.org/doi/10.1126/scirobotics.abm6074) ##DeveloperResources *[ROS2Documentation](https://docs.ros.org/) *[ROSPackageAPIreference](https://docs.ros.org/en/rolling/p/) *[ROSPackageIndex](https://index.ros.org/) *[ROSonDockerHub](https://hub.docker.com/_/ros/)'} vector=None shard_key=None order_value=None\n",
25
- "id=3 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 3, 'text': '*[ROSResourceStatusPage](https://status.openrobotics.org/) *[REP-2000](https://ros.org/reps/rep-2000.html):ROS2ReleasesandTargetPlatforms ##ProjectResources *[PurchaseROSSwag](https://spring.ros.org/) *[InformationabouttheROSTrademark](https://www.ros.org/blog/media/) *OnSocialMedia *[OpenRoboticsonLinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation) *[OpenRoboticsonTwitter](https://twitter.com/OpenRoboticsOrg) *[ROS.orgonTwitter](https://twitter.com/ROSOrg)'} vector=None shard_key=None order_value=None\n",
26
- "id=4 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 4, 'text': 'ROSismadepossiblethroughthegeneroussupportofopensourcecontributorsandthenon-profit[OpenSourceRoboticsFoundation(OSRF)](https://www.openrobotics.org/). TaxdeductibledonationstotheOSRFcanbe[madehere.](https://donorbox.org/support-open-robotics?utm_medium=qrcode&utm_source=qrcode)'} vector=None shard_key=None order_value=None\n",
27
- "id=5 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/.gitignore', 'type': 'Github', 'chunk': 0, 'text': '#Ignoredefaultnamesforcolconcreatedfolders build install log #Ignoreeverythinginsrcexcepta.gitkeepfile src/* !src/.gitkeep'} vector=None shard_key=None order_value=None\n",
28
- "id=6 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/CODEOWNERS', 'type': 'Github', 'chunk': 0, 'text': '#Thisfilewasgeneratedbyhttps://github.com/audrow/update-ros2-repos *@clalancette@codebot'} vector=None shard_key=None order_value=None\n",
29
- "id=7 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 0, 'text': 'repositories: ament/ament_cmake: type:git url:https://github.com/ament/ament_cmake.git version:rolling ament/ament_index: type:git url:https://github.com/ament/ament_index.git version:rolling ament/ament_lint: type:git url:https://github.com/ament/ament_lint.git version:rolling ament/ament_package: type:git url:https://github.com/ament/ament_package.git version:rolling ament/google_benchmark_vendor: type:git url:https://github.com/ament/google_benchmark_vendor.git version:rolling'} vector=None shard_key=None order_value=None\n",
30
- "id=8 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 1, 'text': 'version:rolling ament/googletest: type:git url:https://github.com/ament/googletest.git version:rolling ament/uncrustify_vendor: type:git url:https://github.com/ament/uncrustify_vendor.git version:rolling eProsima/Fast-CDR: type:git url:https://github.com/eProsima/Fast-CDR.git version:2.2.x eProsima/Fast-DDS: type:git url:https://github.com/eProsima/Fast-DDS.git version:2.14.x eProsima/foonathan_memory_vendor: type:git url:https://github.com/eProsima/foonathan_memory_vendor.git version:master'} vector=None shard_key=None order_value=None\n",
31
- "id=9 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 2, 'text': 'version:master eclipse-cyclonedds/cyclonedds: type:git url:https://github.com/eclipse-cyclonedds/cyclonedds.git version:releases/0.10.x eclipse-iceoryx/iceoryx: type:git url:https://github.com/eclipse-iceoryx/iceoryx.git version:release_2.0 gazebo-release/gz_cmake_vendor: type:git url:https://github.com/gazebo-release/gz_cmake_vendor.git version:rolling gazebo-release/gz_math_vendor: type:git url:https://github.com/gazebo-release/gz_math_vendor.git version:rolling'} vector=None shard_key=None order_value=None\n",
32
- "\n",
33
- "Number of Github chunks: 10\n",
34
- "\n",
35
- "Sample Github chunk(metadata not the vector): \n",
36
- "id=0 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 0, 'text': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\"} vector=None shard_key=None order_value=None \n",
37
- "\n"
38
- ]
39
- },
40
- {
41
- "name": "stderr",
42
- "output_type": "stream",
43
- "text": [
44
- "/workspaces/RAG_LLM/project/Tools/shared.py:57: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaEmbeddings``.\n",
45
- " return OllamaEmbeddings(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n"
46
- ]
47
- },
48
- {
49
- "name": "stdout",
50
- "output_type": "stream",
51
- "text": [
52
- "\n",
53
- "Sample search result(n=2): \n",
54
- "id=4 version=4 score=0.38799083 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 4, 'text': '11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | 2021 Open Robotics'} vector=None shard_key=None order_value=None\n",
55
- "id=2 version=2 score=0.35047314 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 2, 'text': 'for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The'} vector=None shard_key=None order_value=None\n"
56
  ]
57
  }
58
  ],
@@ -63,14 +23,15 @@
63
  "# Show everything in the Document collection\n",
64
  "numDocumentChunks = 0\n",
65
  "# Note with_vectors defaults to false, so the vectors are not returned\n",
66
- "chunks = qClient.scroll(collection_name='Document')\n",
67
- "#print(chunks)\n",
68
- "for chunk in chunks[0]:\n",
69
- " # Only display chunks if vector database is small\n",
70
- " print(chunk)\n",
71
- " if numDocumentChunks == 0:\n",
72
- " sampleDocumentChunk = chunk\n",
73
- " numDocumentChunks += 1\n",
 
74
  "print(\"Number of document chunks: \", numDocumentChunks)\n",
75
  "if numDocumentChunks > 0:\n",
76
  " print(\"\\nSample document chunk(metadata not the vector): \")\n",
@@ -78,17 +39,19 @@
78
  "\n",
79
  "# Show everything in the Github collection\n",
80
  "numGithubChunks = 0\n",
81
- "chunks = qClient.scroll(collection_name='Github')\n",
82
- "#print(chunks)\n",
83
- "for chunk in chunks[0]:\n",
84
- " # Only display chunks if vector database is small\n",
85
- " print(chunk)\n",
86
- " if numGithubChunks == 0:\n",
87
- " sampleGithubChunk = chunk\n",
88
- " numGithubChunks += 1\n",
89
- "print(\"\\nNumber of Github chunks: \", numGithubChunks)\n",
 
 
90
  "if numGithubChunks > 0:\n",
91
- " print(\"\\nSample Github chunk(metadata not the vector): \")\n",
92
  " print(sampleGithubChunk, '\\n')\n",
93
  "\n",
94
  "# Show a sample search\n",
@@ -96,7 +59,7 @@
96
  "results = qClient.search(\n",
97
  " collection_name=\"Document\",\n",
98
  " query_vector = embeddingsModel.embed_query(\"What operating system is ROS made for?\"),\n",
99
- " limit=2\n",
100
  ")\n",
101
  "print(\"\\nSample search result(n=2): \")\n",
102
  "for result in results:\n",
@@ -105,7 +68,7 @@
105
  },
106
  {
107
  "cell_type": "code",
108
- "execution_count": 12,
109
  "metadata": {},
110
  "outputs": [
111
  {
@@ -137,7 +100,7 @@
137
  },
138
  {
139
  "cell_type": "code",
140
- "execution_count": 20,
141
  "metadata": {},
142
  "outputs": [
143
  {
@@ -146,7 +109,7 @@
146
  "True"
147
  ]
148
  },
149
- "execution_count": 20,
150
  "metadata": {},
151
  "output_type": "execute_result"
152
  }
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 4,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "name": "stdout",
10
  "output_type": "stream",
11
  "text": [
12
+ "Number of document chunks: 0\n",
13
+ "Number of Github chunks: 0\n",
 
 
 
 
14
  "\n",
15
+ "Sample search result(n=2): \n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  ]
17
  }
18
  ],
 
23
  "# Show everything in the Document collection\n",
24
  "numDocumentChunks = 0\n",
25
  "# Note with_vectors defaults to false, so the vectors are not returned\n",
26
+ "chunks = qClient.scroll(collection_name='Document', limit=100)\n",
27
+ "while True:\n",
28
+ "    for chunk in chunks[0]:\n",
29
+ "        if numDocumentChunks == 0:\n",
30
+ "            sampleDocumentChunk = chunk\n",
31
+ "        numDocumentChunks += 1\n",
32
+ "    if chunks[1] is None:\n",
33
+ "        break\n",
34
+ "    chunks = qClient.scroll(collection_name='Document', limit=100, with_payload=False, offset=chunks[1])\n",
35
  "print(\"Number of document chunks: \", numDocumentChunks)\n",
36
  "if numDocumentChunks > 0:\n",
37
  " print(\"\\nSample document chunk(metadata not the vector): \")\n",
 
39
  "\n",
40
  "# Show everything in the Github collection\n",
41
  "numGithubChunks = 0\n",
42
+ "# Note with_vectors defaults to false, so the vectors are not returned\n",
43
+ "chunks = qClient.scroll(collection_name='Github', limit=100)\n",
44
+ "while True:\n",
45
+ "    for chunk in chunks[0]:\n",
46
+ "        if numGithubChunks == 0:\n",
47
+ "            sampleGithubChunk = chunk\n",
48
+ "        numGithubChunks += 1\n",
49
+ "    if chunks[1] is None:\n",
50
+ "        break\n",
51
+ "    chunks = qClient.scroll(collection_name='Github', limit=100, with_payload=False, offset=chunks[1])\n",
52
+ "print(\"Number of Github chunks: \", numGithubChunks)\n",
53
  "if numGithubChunks > 0:\n",
54
+ "    print(\"\\nSample Github chunk(metadata not the vector): \")\n",
55
  " print(sampleGithubChunk, '\\n')\n",
56
  "\n",
57
  "# Show a sample search\n",
 
59
  "results = qClient.search(\n",
60
  " collection_name=\"Document\",\n",
61
  " query_vector = embeddingsModel.embed_query(\"What operating system is ROS made for?\"),\n",
62
+ " limit=10\n",
63
  ")\n",
64
  "print(\"\\nSample search result(n=2): \")\n",
65
  "for result in results:\n",
 
68
  },
69
  {
70
  "cell_type": "code",
71
+ "execution_count": 22,
72
  "metadata": {},
73
  "outputs": [
74
  {
 
100
  },
101
  {
102
  "cell_type": "code",
103
+ "execution_count": 3,
104
  "metadata": {},
105
  "outputs": [
106
  {
 
109
  "True"
110
  ]
111
  },
112
+ "execution_count": 3,
113
  "metadata": {},
114
  "output_type": "execute_result"
115
  }
project/Tools/mongoTools.ipynb CHANGED
The diff for this file is too large to render. See raw diff