Commit
·
4779f10
1
Parent(s):
50e2c8d
Data collection improvement
Browse files- .gitattributes +35 -35
- README.md +6 -6
- project/.gitignore +3 -1
- project/ClearML/DataCollectionPipeline.py +36 -28
- project/ClearML/FeaturePipeline.py +1 -1
- project/ClearML/InferencePipeline.py +1 -1
- project/DataCollectionPipeline.ipynb +70 -54
- project/FeaturePipeline.ipynb +24 -6
- project/Tools/QdrantTools.ipynb +29 -66
- project/Tools/mongoTools.ipynb +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
-
---
|
2 |
-
language:
|
3 |
-
- en
|
4 |
-
base_model:
|
5 |
-
- meta-llama/Llama-3.2-3B-Instruct
|
6 |
-
---
|
7 |
This model enhances the performance of the Ollama llama3.2 model in the ros2, nav2, moveit2, and gazebo subdomains through RAG. See the project README.md file for more information on how to set up and run the model and how it works.
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- en
|
4 |
+
base_model:
|
5 |
+
- meta-llama/Llama-3.2-3B-Instruct
|
6 |
+
---
|
7 |
This model enhances the performance of the Ollama llama3.2 model in the ros2, nav2, moveit2, and gazebo subdomains through RAG. See the project README.md file for more information on how to set up and run the model and how it works.
|
project/.gitignore
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
.gradio
|
2 |
-
.env
|
|
|
|
|
|
1 |
.gradio
|
2 |
+
.env
|
3 |
+
__pycache__
|
4 |
+
/tool/__pycache__
|
project/ClearML/DataCollectionPipeline.py
CHANGED
@@ -12,6 +12,7 @@ import pymongo
|
|
12 |
import requests
|
13 |
from bs4 import BeautifulSoup
|
14 |
from clearml import PipelineDecorator
|
|
|
15 |
from dotenv import load_dotenv
|
16 |
|
17 |
# Setup ClearML
|
@@ -26,6 +27,7 @@ CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
|
|
26 |
CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRETKEY")
|
27 |
|
28 |
# Input into the Data Collection Pipeline is a list of links to domains
|
|
|
29 |
links = [
|
30 |
"https://www.ros.org/",
|
31 |
"https://docs.nav2.org/",
|
@@ -36,7 +38,8 @@ links = [
|
|
36 |
"https://github.com/moveit/moveit2",
|
37 |
"https://github.com/gazebosim/gazebo-classic",
|
38 |
]
|
39 |
-
|
|
|
40 |
|
41 |
|
42 |
# ETL pipeline
|
@@ -123,33 +126,38 @@ def ETL_Pipeline(links):
|
|
123 |
mongoCollection = mongoDatabase["Document"]
|
124 |
result = mongoCollection.find_one({"link": link})
|
125 |
if result is None:
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
151 |
# Avoid spamming sites
|
152 |
-
time.sleep(1)
|
153 |
# Each document has a link, type(github or other), and content(text)
|
154 |
mongoCollection = mongoDatabase["Document"]
|
155 |
mongoCollection.insert_many(documents)
|
@@ -162,7 +170,7 @@ def ETL_Pipeline(links):
|
|
162 |
@PipelineDecorator.pipeline(
|
163 |
name="Data Collection Pipeline",
|
164 |
project="RAG LLM",
|
165 |
-
version="0.
|
166 |
)
|
167 |
def main():
|
168 |
return ETL_Pipeline(links)
|
|
|
12 |
import requests
|
13 |
from bs4 import BeautifulSoup
|
14 |
from clearml import PipelineDecorator
|
15 |
+
import urllib.parse
|
16 |
from dotenv import load_dotenv
|
17 |
|
18 |
# Setup ClearML
|
|
|
27 |
CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRETKEY")
|
28 |
|
29 |
# Input into the Data Collection Pipeline is a list of links to domains
|
30 |
+
"""
|
31 |
links = [
|
32 |
"https://www.ros.org/",
|
33 |
"https://docs.nav2.org/",
|
|
|
38 |
"https://github.com/moveit/moveit2",
|
39 |
"https://github.com/gazebosim/gazebo-classic",
|
40 |
]
|
41 |
+
"""
|
42 |
+
links = [ "https://www.ros.org/", "https://github.com/ros2/ros2" ]
|
43 |
|
44 |
|
45 |
# ETL pipeline
|
|
|
126 |
mongoCollection = mongoDatabase["Document"]
|
127 |
result = mongoCollection.find_one({"link": link})
|
128 |
if result is None:
|
129 |
+
try:
|
130 |
+
# Get all text in the website
|
131 |
+
r = requests.get(link)
|
132 |
+
soup = BeautifulSoup(r.content, "html.parser")
|
133 |
+
soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])
|
134 |
+
text = soup.get_text()
|
135 |
+
# Transform the data
|
136 |
+
# Get rid of repeating \n characters and spaces
|
137 |
+
text = text.replace("\t", " ")
|
138 |
+
text = text.replace("\n", " ")
|
139 |
+
text_len = len(text)
|
140 |
+
for i in range(text_len):
|
141 |
+
while i + 1 < text_len and text[i] == " " and text[i + 1] == " ":
|
142 |
+
text = text[:i] + text[i + 1 :]
|
143 |
+
text_len -= 1
|
144 |
+
if "404" not in text:
|
145 |
+
documents.append({"link": link, "type": "Document", "content": text})
|
146 |
+
# Also crawl through all subdirectories of the link (related links)
|
147 |
+
soup = BeautifulSoup(r.content, "html.parser")
|
148 |
+
subdirectories = [a.get("href") for a in soup.find_all("a")]
|
149 |
+
for subdirectory in subdirectories:
|
150 |
+
newLink = urllib.parse.urljoin(link, subdirectory)
|
151 |
+
if (
|
152 |
+
subdirectory is not None and
|
153 |
+
'http' not in subdirectory and
|
154 |
+
mongoCollection.find_one({"link": newLink}) is None
|
155 |
+
):
|
156 |
+
links.append(newLink)
|
157 |
+
except:
|
158 |
+
print("Could not crawl link", link)
|
159 |
# Avoid spamming sites
|
160 |
+
time.sleep(0.1)
|
161 |
# Each document has a link, type(github or other), and content(text)
|
162 |
mongoCollection = mongoDatabase["Document"]
|
163 |
mongoCollection.insert_many(documents)
|
|
|
170 |
@PipelineDecorator.pipeline(
|
171 |
name="Data Collection Pipeline",
|
172 |
project="RAG LLM",
|
173 |
+
version="0.4",
|
174 |
)
|
175 |
def main():
|
176 |
return ETL_Pipeline(links)
|
project/ClearML/FeaturePipeline.py
CHANGED
@@ -153,7 +153,7 @@ def storeEmbeddings(embeddings, links, resultTypes, chunks, chunkNums):
|
|
153 |
@PipelineDecorator.pipeline(
|
154 |
name="Feature Pipeline",
|
155 |
project="RAG LLM",
|
156 |
-
version="0.
|
157 |
)
|
158 |
def main():
|
159 |
links, resultTypes, texts = retreiveDocuments()
|
|
|
153 |
@PipelineDecorator.pipeline(
|
154 |
name="Feature Pipeline",
|
155 |
project="RAG LLM",
|
156 |
+
version="0.3",
|
157 |
)
|
158 |
def main():
|
159 |
links, resultTypes, texts = retreiveDocuments()
|
project/ClearML/InferencePipeline.py
CHANGED
@@ -143,7 +143,7 @@ def reranking(results):
|
|
143 |
texts = [result.payload["text"] for result in results]
|
144 |
topTexts = ""
|
145 |
for index in topIndexes:
|
146 |
-
topTexts += texts[index]
|
147 |
return topTexts
|
148 |
|
149 |
|
|
|
143 |
texts = [result.payload["text"] for result in results]
|
144 |
topTexts = ""
|
145 |
for index in topIndexes:
|
146 |
+
topTexts += texts[index]
|
147 |
return topTexts
|
148 |
|
149 |
|
project/DataCollectionPipeline.ipynb
CHANGED
@@ -2,13 +2,17 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
"name": "stdout",
|
10 |
"output_type": "stream",
|
11 |
"text": [
|
|
|
|
|
|
|
|
|
12 |
"Visiting link: https://github.com/ros2/ros2\n"
|
13 |
]
|
14 |
},
|
@@ -23,22 +27,32 @@
|
|
23 |
"name": "stdout",
|
24 |
"output_type": "stream",
|
25 |
"text": [
|
26 |
-
"
|
27 |
-
"Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/.gitignore\n",
|
28 |
-
"Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/CODEOWNERS\n",
|
29 |
-
"Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/ros2.repos\n",
|
30 |
-
"Adding subdirectory: https://github.com/ros2/ros2/tree/rolling/src/.gitkeep\n"
|
31 |
]
|
32 |
},
|
33 |
{
|
34 |
-
"
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
}
|
43 |
],
|
44 |
"source": [
|
@@ -52,26 +66,24 @@
|
|
52 |
"import shutil\n",
|
53 |
"import subprocess\n",
|
54 |
"import tempfile\n",
|
|
|
55 |
"from shared import getMongoClient\n",
|
56 |
"\n",
|
57 |
"# Input into the Data Collection Pipeline is a list of links to domains\n",
|
58 |
"links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n",
|
59 |
-
"links = ['https://www.ros.org/', 'https://github.com/ros2/ros2']\n",
|
60 |
"\n",
|
61 |
"# Create a mongoDB connection\n",
|
62 |
"mongoHost = getMongoClient()\n",
|
63 |
"mongoDatabase = mongoHost[\"twin\"]\n",
|
64 |
-
"\n",
|
65 |
"# ETL pipeline\n",
|
66 |
"# Extract data from links and their subdirectories(using crawlers)\n",
|
67 |
-
"documents = []\n",
|
68 |
"codes = []\n",
|
69 |
"for link in links:\n",
|
70 |
" # Web scraper/crawler for github links\n",
|
71 |
" if \"https://github.com\" in link:\n",
|
72 |
" # Do not revisit a link already in the database\n",
|
73 |
" mongoCollection = mongoDatabase[\"Github\"]\n",
|
74 |
-
" result = mongoCollection.find_one({\"link\": link})\n",
|
75 |
" if result is None:\n",
|
76 |
" print(\"Visiting link: \", link)\n",
|
77 |
" # Modified GithubCrawler from LLM-Engineer for scraping github\n",
|
@@ -106,9 +118,9 @@
|
|
106 |
" path = path.rsplit(\"/\", 1)[0]\n",
|
107 |
" # Push all the subdirectories to mongo\n",
|
108 |
" for subdirectory in tree:\n",
|
109 |
-
" print(\n",
|
110 |
-
"
|
111 |
-
" )\n",
|
112 |
" text = tree[subdirectory]\n",
|
113 |
" # Transform the data\n",
|
114 |
" # Get rid of repeating \\n characters and spaces\n",
|
@@ -129,6 +141,8 @@
|
|
129 |
" \"content\": text,\n",
|
130 |
" }\n",
|
131 |
" )\n",
|
|
|
|
|
132 |
" else:\n",
|
133 |
" print(\"Already visited: \", link)\n",
|
134 |
" # Web scraper/crawler for other links(Documents)\n",
|
@@ -137,42 +151,44 @@
|
|
137 |
" mongoCollection = mongoDatabase[\"Document\"]\n",
|
138 |
" result = mongoCollection.find_one({\"link\": link})\n",
|
139 |
" if result is None:\n",
|
140 |
-
"
|
141 |
-
"
|
142 |
-
"
|
143 |
-
"
|
144 |
-
"
|
145 |
-
"
|
146 |
-
"
|
147 |
-
"
|
148 |
-
"
|
149 |
-
"
|
150 |
-
"
|
151 |
-
"
|
152 |
-
"
|
153 |
-
" text_len
|
154 |
-
"
|
155 |
-
"
|
156 |
-
"
|
157 |
-
"
|
158 |
-
"
|
159 |
-
"
|
160 |
-
"
|
161 |
-
"
|
162 |
-
"
|
163 |
-
"
|
164 |
-
"
|
165 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
" else:\n",
|
167 |
" print(\"Already visited: \", link)\n",
|
168 |
" # Avoid spamming sites\n",
|
169 |
-
" time.sleep(1)
|
170 |
-
"# Each document has a link, type(github or other), and content(text)\n",
|
171 |
-
"# You can go to Tools/mongoTools to view the inserted documents\n",
|
172 |
-
"mongoCollection = mongoDatabase[\"Document\"]\n",
|
173 |
-
"mongoCollection.insert_many(documents)\n",
|
174 |
-
"mongoCollection = mongoDatabase[\"Github\"]\n",
|
175 |
-
"mongoCollection.insert_many(codes)"
|
176 |
]
|
177 |
}
|
178 |
],
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 10,
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
"name": "stdout",
|
10 |
"output_type": "stream",
|
11 |
"text": [
|
12 |
+
"Visiting link: https://www.ros.org/\n",
|
13 |
+
"Visiting link: https://docs.nav2.org/\n",
|
14 |
+
"Visiting link: https://moveit.ai/\n",
|
15 |
+
"Visiting link: https://gazebosim.org/home\n",
|
16 |
"Visiting link: https://github.com/ros2/ros2\n"
|
17 |
]
|
18 |
},
|
|
|
27 |
"name": "stdout",
|
28 |
"output_type": "stream",
|
29 |
"text": [
|
30 |
+
"Visiting link: https://github.com/ros-navigation/navigation2\n"
|
|
|
|
|
|
|
|
|
31 |
]
|
32 |
},
|
33 |
{
|
34 |
+
"name": "stderr",
|
35 |
+
"output_type": "stream",
|
36 |
+
"text": [
|
37 |
+
"Cloning into 'navigation2'...\n",
|
38 |
+
"fatal: early EOF\n"
|
39 |
+
]
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"ename": "KeyboardInterrupt",
|
43 |
+
"evalue": "",
|
44 |
+
"output_type": "error",
|
45 |
+
"traceback": [
|
46 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
47 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
48 |
+
"Cell \u001b[0;32mIn[10], line 35\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 34\u001b[0m os\u001b[38;5;241m.\u001b[39mchdir(local_temp)\n\u001b[0;32m---> 35\u001b[0m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mclone\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlink\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 36\u001b[0m repo_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(local_temp, os\u001b[38;5;241m.\u001b[39mlistdir(local_temp)[\u001b[38;5;241m0\u001b[39m])\n\u001b[1;32m 37\u001b[0m tree \u001b[38;5;241m=\u001b[39m {}\n",
|
49 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:550\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m Popen(\u001b[38;5;241m*\u001b[39mpopenargs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m 549\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 550\u001b[0m stdout, stderr \u001b[38;5;241m=\u001b[39m \u001b[43mprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcommunicate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 551\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m TimeoutExpired \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 552\u001b[0m process\u001b[38;5;241m.\u001b[39mkill()\n",
|
50 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1201\u001b[0m, in \u001b[0;36mPopen.communicate\u001b[0;34m(self, input, timeout)\u001b[0m\n\u001b[1;32m 1199\u001b[0m stderr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 1200\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m-> 1201\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1202\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
51 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:1264\u001b[0m, in \u001b[0;36mPopen.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 1262\u001b[0m endtime \u001b[38;5;241m=\u001b[39m _time() \u001b[38;5;241m+\u001b[39m timeout\n\u001b[1;32m 1263\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1264\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1265\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1266\u001b[0m \u001b[38;5;66;03m# https://bugs.python.org/issue25942\u001b[39;00m\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;66;03m# The first keyboard interrupt waits briefly for the child to\u001b[39;00m\n\u001b[1;32m 1268\u001b[0m \u001b[38;5;66;03m# exit under the common assumption that it also received the ^C\u001b[39;00m\n\u001b[1;32m 1269\u001b[0m \u001b[38;5;66;03m# generated SIGINT and will exit rapidly.\u001b[39;00m\n\u001b[1;32m 1270\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
52 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2053\u001b[0m, in \u001b[0;36mPopen._wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 2051\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreturncode \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 2052\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m \u001b[38;5;66;03m# Another thread waited.\u001b[39;00m\n\u001b[0;32m-> 2053\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_try_wait\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2054\u001b[0m \u001b[38;5;66;03m# Check the pid and loop as waitpid has been known to\u001b[39;00m\n\u001b[1;32m 2055\u001b[0m \u001b[38;5;66;03m# return 0 even without WNOHANG in odd situations.\u001b[39;00m\n\u001b[1;32m 2056\u001b[0m \u001b[38;5;66;03m# http://bugs.python.org/issue14396.\u001b[39;00m\n\u001b[1;32m 2057\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m pid \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid:\n",
|
53 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/subprocess.py:2011\u001b[0m, in \u001b[0;36mPopen._try_wait\u001b[0;34m(self, wait_flags)\u001b[0m\n\u001b[1;32m 2009\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"All callers to this function MUST hold self._waitpid_lock.\"\"\"\u001b[39;00m\n\u001b[1;32m 2010\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2011\u001b[0m (pid, sts) \u001b[38;5;241m=\u001b[39m \u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwaitpid\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwait_flags\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2012\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mChildProcessError\u001b[39;00m:\n\u001b[1;32m 2013\u001b[0m \u001b[38;5;66;03m# This happens if SIGCLD is set to be ignored or waiting\u001b[39;00m\n\u001b[1;32m 2014\u001b[0m \u001b[38;5;66;03m# for child processes has otherwise been disabled for our\u001b[39;00m\n\u001b[1;32m 2015\u001b[0m \u001b[38;5;66;03m# process. This child is dead, we can't get the status.\u001b[39;00m\n\u001b[1;32m 2016\u001b[0m pid \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpid\n",
|
54 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
55 |
+
]
|
56 |
}
|
57 |
],
|
58 |
"source": [
|
|
|
66 |
"import shutil\n",
|
67 |
"import subprocess\n",
|
68 |
"import tempfile\n",
|
69 |
+
"import urllib.parse\n",
|
70 |
"from shared import getMongoClient\n",
|
71 |
"\n",
|
72 |
"# Input into the Data Collection Pipeline is a list of links to domains\n",
|
73 |
"links = ['https://www.ros.org/','https://docs.nav2.org/','https://moveit.ai/','https://gazebosim.org/home', 'https://github.com/ros2/ros2', 'https://github.com/ros-navigation/navigation2', 'https://github.com/moveit/moveit2', 'https://github.com/gazebosim/gazebo-classic']\n",
|
|
|
74 |
"\n",
|
75 |
"# Create a mongoDB connection\n",
|
76 |
"mongoHost = getMongoClient()\n",
|
77 |
"mongoDatabase = mongoHost[\"twin\"]\n",
|
|
|
78 |
"# ETL pipeline\n",
|
79 |
"# Extract data from links and their subdirectories(using crawlers)\n",
|
|
|
80 |
"codes = []\n",
|
81 |
"for link in links:\n",
|
82 |
" # Web scraper/crawler for github links\n",
|
83 |
" if \"https://github.com\" in link:\n",
|
84 |
" # Do not revisit a link already in the database\n",
|
85 |
" mongoCollection = mongoDatabase[\"Github\"]\n",
|
86 |
+
" result = mongoCollection.find_one({\"link\": {\"$regex\" : link}})\n",
|
87 |
" if result is None:\n",
|
88 |
" print(\"Visiting link: \", link)\n",
|
89 |
" # Modified GithubCrawler from LLM-Engineer for scraping github\n",
|
|
|
118 |
" path = path.rsplit(\"/\", 1)[0]\n",
|
119 |
" # Push all the subdirectories to mongo\n",
|
120 |
" for subdirectory in tree:\n",
|
121 |
+
" #print(\n",
|
122 |
+
" # f\"Adding subdirectory: https://github.com{path}/{subdirectory}\"\n",
|
123 |
+
" #)\n",
|
124 |
" text = tree[subdirectory]\n",
|
125 |
" # Transform the data\n",
|
126 |
" # Get rid of repeating \\n characters and spaces\n",
|
|
|
141 |
" \"content\": text,\n",
|
142 |
" }\n",
|
143 |
" )\n",
|
144 |
+
" mongoCollection.insert_many(codes)\n",
|
145 |
+
" codes = []\n",
|
146 |
" else:\n",
|
147 |
" print(\"Already visited: \", link)\n",
|
148 |
" # Web scraper/crawler for other links(Documents)\n",
|
|
|
151 |
" mongoCollection = mongoDatabase[\"Document\"]\n",
|
152 |
" result = mongoCollection.find_one({\"link\": link})\n",
|
153 |
" if result is None:\n",
|
154 |
+
" print(\"Visiting link: \", link)\n",
|
155 |
+
" try:\n",
|
156 |
+
" # Get all text in the website\n",
|
157 |
+
" r = requests.get(link)\n",
|
158 |
+
" soup = BeautifulSoup(r.content, \"html.parser\")\n",
|
159 |
+
" soup.find_all([\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\"])\n",
|
160 |
+
" text = soup.get_text()\n",
|
161 |
+
" # Transform the data\n",
|
162 |
+
" # Get rid of repeating \\n characters and spaces\n",
|
163 |
+
" text = text.replace(\"\\t\", \" \")\n",
|
164 |
+
" text = text.replace(\"\\n\", \" \")\n",
|
165 |
+
" text_len = len(text)\n",
|
166 |
+
" for i in range(text_len):\n",
|
167 |
+
" while i + 1 < text_len and text[i] == \" \" and text[i + 1] == \" \":\n",
|
168 |
+
" text = text[:i] + text[i + 1 :]\n",
|
169 |
+
" text_len -= 1\n",
|
170 |
+
" if \"404\" not in text:\n",
|
171 |
+
" mongoCollection.insert_one({\"link\": link, \"type\": \"Document\", \"content\": text})\n",
|
172 |
+
" else:\n",
|
173 |
+
" print(\"Page not found: \", link)\n",
|
174 |
+
" # Also crawl through all subdirectorys in the link(related links)\n",
|
175 |
+
" soup = BeautifulSoup(r.content, \"html.parser\")\n",
|
176 |
+
" subdirectories = [a.get(\"href\") for a in soup.find_all(\"a\")]\n",
|
177 |
+
" for subdirectory in subdirectories:\n",
|
178 |
+
" newLink = urllib.parse.urljoin(link, subdirectory)\n",
|
179 |
+
" if (\n",
|
180 |
+
" subdirectory is not None and\n",
|
181 |
+
" 'http' not in subdirectory and\n",
|
182 |
+
" mongoCollection.find_one({\"link\": newLink}) is None\n",
|
183 |
+
" ):\n",
|
184 |
+
" #print(\"Adding subdirectory: \", link + subdirectory)\n",
|
185 |
+
" links.append(newLink)\n",
|
186 |
+
" except:\n",
|
187 |
+
" print(\"Could not crawl link\", link)\n",
|
188 |
" else:\n",
|
189 |
" print(\"Already visited: \", link)\n",
|
190 |
" # Avoid spamming sites\n",
|
191 |
+
" time.sleep(0.1)"
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
]
|
193 |
}
|
194 |
],
|
project/FeaturePipeline.ipynb
CHANGED
@@ -2,15 +2,33 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
-
"
|
10 |
-
"
|
11 |
-
"
|
12 |
-
|
13 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
]
|
15 |
}
|
16 |
],
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 15,
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
+
"ename": "KeyboardInterrupt",
|
10 |
+
"evalue": "",
|
11 |
+
"output_type": "error",
|
12 |
+
"traceback": [
|
13 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
14 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
15 |
+
"Cell \u001b[0;32mIn[15], line 61\u001b[0m\n\u001b[1;32m 58\u001b[0m chunkNum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m chunks:\n\u001b[1;32m 60\u001b[0m \u001b[38;5;66;03m# Create embeddings for each chunk, of length 2048 using the embedding model\u001b[39;00m\n\u001b[0;32m---> 61\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[43membeddingsModel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membed_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;66;03m# Store the embedding along with some metadata into the Qdrant vector database\u001b[39;00m\n\u001b[1;32m 63\u001b[0m qClient\u001b[38;5;241m.\u001b[39mupsert(collection_name\u001b[38;5;241m=\u001b[39mresultType, wait\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, points\u001b[38;5;241m=\u001b[39m[PointStruct(\u001b[38;5;28mid\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mid\u001b[39m, vector\u001b[38;5;241m=\u001b[39membedding, payload\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlink\u001b[39m\u001b[38;5;124m\"\u001b[39m: link, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: resultType, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchunk\u001b[39m\u001b[38;5;124m\"\u001b[39m: chunkNum, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m: chunk})])\n",
|
16 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/langchain_community/embeddings/ollama.py:227\u001b[0m, in \u001b[0;36mOllamaEmbeddings.embed_query\u001b[0;34m(self, text)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Embed a query using a Ollama deployed embedding model.\u001b[39;00m\n\u001b[1;32m 219\u001b[0m \n\u001b[1;32m 220\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 224\u001b[0m \u001b[38;5;124;03m Embeddings for the text.\u001b[39;00m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 226\u001b[0m instruction_pair \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mquery_instruction\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mtext\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 227\u001b[0m embedding \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_embed\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43minstruction_pair\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 228\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m embedding\n",
|
17 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/langchain_community/embeddings/ollama.py:202\u001b[0m, in \u001b[0;36mOllamaEmbeddings._embed\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 201\u001b[0m iter_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28minput\u001b[39m\n\u001b[0;32m--> 202\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_process_emb_response\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m prompt \u001b[38;5;129;01min\u001b[39;00m iter_]\n",
|
18 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/langchain_community/embeddings/ollama.py:167\u001b[0m, in \u001b[0;36mOllamaEmbeddings._process_emb_response\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 161\u001b[0m headers \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 162\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mContent-Type\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mapplication/json\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 163\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mheaders \u001b[38;5;129;01mor\u001b[39;00m {}),\n\u001b[1;32m 164\u001b[0m }\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mrequests\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpost\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 168\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbase_url\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m/api/embeddings\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 169\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 170\u001b[0m \u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mprompt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_default_params\u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 171\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m requests\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mRequestException \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mError raised by inference endpoint: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
|
19 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/api.py:115\u001b[0m, in \u001b[0;36mpost\u001b[0;34m(url, data, json, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpost\u001b[39m(url, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, json\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 104\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"Sends a POST request.\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \n\u001b[1;32m 106\u001b[0m \u001b[38;5;124;03m :param url: URL for the new :class:`Request` object.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;124;03m :rtype: requests.Response\u001b[39;00m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpost\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjson\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
20 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/api.py:59\u001b[0m, in \u001b[0;36mrequest\u001b[0;34m(method, url, **kwargs)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;66;03m# By using the 'with' statement we are sure the session is closed, thus we\u001b[39;00m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;66;03m# avoid leaving sockets open which can trigger a ResourceWarning in some\u001b[39;00m\n\u001b[1;32m 57\u001b[0m \u001b[38;5;66;03m# cases, and look like a memory leak in others.\u001b[39;00m\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m sessions\u001b[38;5;241m.\u001b[39mSession() \u001b[38;5;28;01mas\u001b[39;00m session:\n\u001b[0;32m---> 59\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
21 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 584\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 585\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[1;32m 586\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 587\u001b[0m }\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n",
|
22 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 700\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[1;32m 702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m 706\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n",
|
23 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/requests/adapters.py:667\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 664\u001b[0m timeout \u001b[38;5;241m=\u001b[39m TimeoutSauce(connect\u001b[38;5;241m=\u001b[39mtimeout, read\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 666\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 667\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 668\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 669\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 670\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 671\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 672\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 673\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 674\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 675\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 676\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 677\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 678\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 679\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 681\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 682\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(err, request\u001b[38;5;241m=\u001b[39mrequest)\n",
|
24 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/urllib3/connectionpool.py:789\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[0m\n\u001b[1;32m 786\u001b[0m response_conn \u001b[38;5;241m=\u001b[39m conn \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m release_conn \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 788\u001b[0m \u001b[38;5;66;03m# Make the request on the HTTPConnection object\u001b[39;00m\n\u001b[0;32m--> 789\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 790\u001b[0m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 791\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 792\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 793\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 794\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 795\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 796\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 797\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 798\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresponse_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 799\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 800\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 801\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 802\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 804\u001b[0m \u001b[38;5;66;03m# Everything went great!\u001b[39;00m\n\u001b[1;32m 805\u001b[0m clean_exit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
|
25 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/urllib3/connectionpool.py:536\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[0m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;66;03m# Receive the response from the server\u001b[39;00m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 536\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (BaseSSLError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 538\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_timeout(err\u001b[38;5;241m=\u001b[39me, url\u001b[38;5;241m=\u001b[39murl, timeout_value\u001b[38;5;241m=\u001b[39mread_timeout)\n",
|
26 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/site-packages/urllib3/connection.py:507\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mresponse\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HTTPResponse\n\u001b[1;32m 506\u001b[0m \u001b[38;5;66;03m# Get the response from http.client.HTTPConnection\u001b[39;00m\n\u001b[0;32m--> 507\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 509\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 510\u001b[0m assert_header_parsing(httplib_response\u001b[38;5;241m.\u001b[39mmsg)\n",
|
27 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/http/client.py:1428\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1426\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1427\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1428\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbegin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1429\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[1;32m 1430\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
|
28 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/http/client.py:331\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 331\u001b[0m version, status, reason \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_read_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m!=\u001b[39m CONTINUE:\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
|
29 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/http/client.py:292\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 292\u001b[0m line \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_MAXLINE\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miso-8859-1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) \u001b[38;5;241m>\u001b[39m _MAXLINE:\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus line\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
30 |
+
"File \u001b[0;32m/usr/local/lib/python3.12/socket.py:720\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 720\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 721\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 722\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
|
31 |
+
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
32 |
]
|
33 |
}
|
34 |
],
|
project/Tools/QdrantTools.ipynb
CHANGED
@@ -2,57 +2,17 @@
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
"name": "stdout",
|
10 |
"output_type": "stream",
|
11 |
"text": [
|
12 |
-
"
|
13 |
-
"
|
14 |
-
"id=2 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 2, 'text': 'for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The'} vector=None shard_key=None order_value=None\n",
|
15 |
-
"id=3 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 3, 'text': 'Katherine Scott The videos from ROSCon 2024 in Odense are now available on the ROSCon Website (see the program), this Vimeo showcase, and in the ROS documentation. The ROSCon website also includes the slides from all the talks at ROSCon. I have also included a list of all the videos below. I want to thank AMD for being our 2024 ROSCon video sponsor, their generous support makes the ROSCon live stream and videos possible. READ MORE Recent ROS Discourse Posts ROS News of the Week 11/22/2024 - ROS'} vector=None shard_key=None order_value=None\n",
|
16 |
-
"id=4 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 4, 'text': '11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | 2021 Open Robotics'} vector=None shard_key=None order_value=None\n",
|
17 |
-
"Number of document chunks: 5\n",
|
18 |
"\n",
|
19 |
-
"Sample
|
20 |
-
"id=0 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 0, 'text': 'ROS: Home Why ROS? Getting Started Community Ecosystem ROS - Robot Operating System The Robot Operating System (ROS) is a set of software libraries and tools that help you build robot applications. From drivers to state-of-the-art algorithms, and with powerful developer tools, ROS has what you need for your next robotics project. And it\\'s all open source. What is ROS? ROS Videos \" Install Jazzy Jalisco Jazzy Jalisco is our latest ROS 2 LTS release targeted at the Ubuntu 24.04 (Noble) and'} vector=None shard_key=None order_value=None \n",
|
21 |
-
"\n",
|
22 |
-
"id=0 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 0, 'text': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\"} vector=None shard_key=None order_value=None\n",
|
23 |
-
"id=1 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 1, 'text': \"Onceyou'veinstalledROSstartbylearningsome[basicconcepts](https://docs.ros.org/en/rolling/Concepts/Basic.html)andtakealookatour[beginnertutorials](https://docs.ros.org/en/rolling/Tutorials/Beginner-CLI-Tools.html). #JointheROSCommunity ##CommunityResources *[ROSDiscussionForum](https://discourse.ros.org/) *[ROSDiscordServer](https://discord.com/servers/open-robotics-1077825543698927656) *[RoboticsStackExchange](https://robotics.stackexchange.com/)(preferredROSsupportforum).\"} vector=None shard_key=None order_value=None\n",
|
24 |
-
"id=2 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 2, 'text': '*[OfficialROSVideos](https://vimeo.com/osrfoundation) *[ROSCon](https://roscon.ros.org),ouryearlydeveloperconference. *CiteROS2inacademicworkusing[DOI:10.1126/scirobotics.abm6074](https://www.science.org/doi/10.1126/scirobotics.abm6074) ##DeveloperResources *[ROS2Documentation](https://docs.ros.org/) *[ROSPackageAPIreference](https://docs.ros.org/en/rolling/p/) *[ROSPackageIndex](https://index.ros.org/) *[ROSonDockerHub](https://hub.docker.com/_/ros/)'} vector=None shard_key=None order_value=None\n",
|
25 |
-
"id=3 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 3, 'text': '*[ROSResourceStatusPage](https://status.openrobotics.org/) *[REP-2000](https://ros.org/reps/rep-2000.html):ROS2ReleasesandTargetPlatforms ##ProjectResources *[PurchaseROSSwag](https://spring.ros.org/) *[InformationabouttheROSTrademark](https://www.ros.org/blog/media/) *OnSocialMedia *[OpenRoboticsonLinkedIn](https://www.linkedin.com/company/open-source-robotics-foundation) *[OpenRoboticsonTwitter](https://twitter.com/OpenRoboticsOrg) *[ROS.orgonTwitter](https://twitter.com/ROSOrg)'} vector=None shard_key=None order_value=None\n",
|
26 |
-
"id=4 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 4, 'text': 'ROSismadepossiblethroughthegeneroussupportofopensourcecontributorsandthenon-profit[OpenSourceRoboticsFoundation(OSRF)](https://www.openrobotics.org/). TaxdeductibledonationstotheOSRFcanbe[madehere.](https://donorbox.org/support-open-robotics?utm_medium=qrcode&utm_source=qrcode)'} vector=None shard_key=None order_value=None\n",
|
27 |
-
"id=5 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/.gitignore', 'type': 'Github', 'chunk': 0, 'text': '#Ignoredefaultnamesforcolconcreatedfolders build install log #Ignoreeverythinginsrcexcepta.gitkeepfile src/* !src/.gitkeep'} vector=None shard_key=None order_value=None\n",
|
28 |
-
"id=6 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/CODEOWNERS', 'type': 'Github', 'chunk': 0, 'text': '#Thisfilewasgeneratedbyhttps://github.com/audrow/update-ros2-repos *@clalancette@codebot'} vector=None shard_key=None order_value=None\n",
|
29 |
-
"id=7 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 0, 'text': 'repositories: ament/ament_cmake: type:git url:https://github.com/ament/ament_cmake.git version:rolling ament/ament_index: type:git url:https://github.com/ament/ament_index.git version:rolling ament/ament_lint: type:git url:https://github.com/ament/ament_lint.git version:rolling ament/ament_package: type:git url:https://github.com/ament/ament_package.git version:rolling ament/google_benchmark_vendor: type:git url:https://github.com/ament/google_benchmark_vendor.git version:rolling'} vector=None shard_key=None order_value=None\n",
|
30 |
-
"id=8 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 1, 'text': 'version:rolling ament/googletest: type:git url:https://github.com/ament/googletest.git version:rolling ament/uncrustify_vendor: type:git url:https://github.com/ament/uncrustify_vendor.git version:rolling eProsima/Fast-CDR: type:git url:https://github.com/eProsima/Fast-CDR.git version:2.2.x eProsima/Fast-DDS: type:git url:https://github.com/eProsima/Fast-DDS.git version:2.14.x eProsima/foonathan_memory_vendor: type:git url:https://github.com/eProsima/foonathan_memory_vendor.git version:master'} vector=None shard_key=None order_value=None\n",
|
31 |
-
"id=9 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/ros2.repos', 'type': 'Github', 'chunk': 2, 'text': 'version:master eclipse-cyclonedds/cyclonedds: type:git url:https://github.com/eclipse-cyclonedds/cyclonedds.git version:releases/0.10.x eclipse-iceoryx/iceoryx: type:git url:https://github.com/eclipse-iceoryx/iceoryx.git version:release_2.0 gazebo-release/gz_cmake_vendor: type:git url:https://github.com/gazebo-release/gz_cmake_vendor.git version:rolling gazebo-release/gz_math_vendor: type:git url:https://github.com/gazebo-release/gz_math_vendor.git version:rolling'} vector=None shard_key=None order_value=None\n",
|
32 |
-
"\n",
|
33 |
-
"Number of Github chunks: 10\n",
|
34 |
-
"\n",
|
35 |
-
"Sample Github chunk(metadata not the vector): \n",
|
36 |
-
"id=0 payload={'link': 'https://github.com/ros2/ros2/tree/rolling/README.md', 'type': 'Github', 'chunk': 0, 'text': \"#About TheRobotOperatingSystem(ROS)isasetofsoftwarelibrariesandtoolsthathelpyoubuildrobotapplications. Fromdriverstostate-of-the-artalgorithms,andwithpowerfuldevelopertools,ROShaswhatyouneedforyournextroboticsproject. Andit'sallopensource. Fullprojectdetailson[ROS.org](https://ros.org/) #GettingStarted LookingtogetstartedwithROS? Our[installationguideishere](https://www.ros.org/blog/getting-started/).\"} vector=None shard_key=None order_value=None \n",
|
37 |
-
"\n"
|
38 |
-
]
|
39 |
-
},
|
40 |
-
{
|
41 |
-
"name": "stderr",
|
42 |
-
"output_type": "stream",
|
43 |
-
"text": [
|
44 |
-
"/workspaces/RAG_LLM/project/Tools/shared.py:57: LangChainDeprecationWarning: The class `OllamaEmbeddings` was deprecated in LangChain 0.3.1 and will be removed in 1.0.0. An updated version of the class exists in the :class:`~langchain-ollama package and should be used instead. To use it run `pip install -U :class:`~langchain-ollama` and import as `from :class:`~langchain_ollama import OllamaEmbeddings``.\n",
|
45 |
-
" return OllamaEmbeddings(model=MODEL, base_url=\"http://host.docker.internal:11434\")\n"
|
46 |
-
]
|
47 |
-
},
|
48 |
-
{
|
49 |
-
"name": "stdout",
|
50 |
-
"output_type": "stream",
|
51 |
-
"text": [
|
52 |
-
"\n",
|
53 |
-
"Sample search result(n=2): \n",
|
54 |
-
"id=4 version=4 score=0.38799083 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 4, 'text': '11/22/2024 - ROS Discourse Gazebo Classic and Citadel End of Life 12/2/2024 - ROS Discourse ROS 2 driver for Mitsubishi Melfa RV-FR 10/24/2024 ROS Discourse Home Why ROS? Getting Started Community Ecosystem Q&A Forum Packages Wiki Documentation media Q&A Forum Packages ROSCon Wiki documentation discord Brought to you by Open Robotics | licensed under Creative Commons Attributions 3.0 | 2021 Open Robotics'} vector=None shard_key=None order_value=None\n",
|
55 |
-
"id=2 version=2 score=0.35047314 payload={'link': 'https://www.ros.org/', 'type': 'Document', 'chunk': 2, 'text': 'for ROS 2 Stack Exchange Ask questions. Get answers. Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Documentation Documentation and tutorials for ROS 2 Robotics Stack Exchange Ask questions.Get answers.All ROS versions Forums Hear the latest discussions ROS 1 Wiki Legacy documentation and tutorials for ROS 1 Recent Updates and Highlights ROSCon 2024 Videos are Now Available See the ROSCon 2024 website for details 11/18/2024 - Katherine Scott The'} vector=None shard_key=None order_value=None\n"
|
56 |
]
|
57 |
}
|
58 |
],
|
@@ -63,14 +23,15 @@
|
|
63 |
"# Show everything in the Document collection\n",
|
64 |
"numDocumentChunks = 0\n",
|
65 |
"# Note with_vectors defaults to false, so the vectors are not returned\n",
|
66 |
-
"chunks = qClient.scroll(collection_name='Document')\n",
|
67 |
-
"
|
68 |
-
"for chunk in chunks[0]:\n",
|
69 |
-
"
|
70 |
-
"
|
71 |
-
"
|
72 |
-
"
|
73 |
-
"
|
|
|
74 |
"print(\"Number of document chunks: \", numDocumentChunks)\n",
|
75 |
"if numDocumentChunks > 0:\n",
|
76 |
" print(\"\\nSample document chunk(metadata not the vector): \")\n",
|
@@ -78,17 +39,19 @@
|
|
78 |
"\n",
|
79 |
"# Show everything in the Github collection\n",
|
80 |
"numGithubChunks = 0\n",
|
81 |
-
"
|
82 |
-
"
|
83 |
-
"
|
84 |
-
"
|
85 |
-
"
|
86 |
-
"
|
87 |
-
"
|
88 |
-
"
|
89 |
-
"
|
|
|
|
|
90 |
"if numGithubChunks > 0:\n",
|
91 |
-
" print(\"\\nSample
|
92 |
" print(sampleGithubChunk, '\\n')\n",
|
93 |
"\n",
|
94 |
"# Show a sample search\n",
|
@@ -96,7 +59,7 @@
|
|
96 |
"results = qClient.search(\n",
|
97 |
" collection_name=\"Document\",\n",
|
98 |
" query_vector = embeddingsModel.embed_query(\"What operating system is ROS made for?\"),\n",
|
99 |
-
" limit=
|
100 |
")\n",
|
101 |
"print(\"\\nSample search result(n=2): \")\n",
|
102 |
"for result in results:\n",
|
@@ -105,7 +68,7 @@
|
|
105 |
},
|
106 |
{
|
107 |
"cell_type": "code",
|
108 |
-
"execution_count":
|
109 |
"metadata": {},
|
110 |
"outputs": [
|
111 |
{
|
@@ -137,7 +100,7 @@
|
|
137 |
},
|
138 |
{
|
139 |
"cell_type": "code",
|
140 |
-
"execution_count":
|
141 |
"metadata": {},
|
142 |
"outputs": [
|
143 |
{
|
@@ -146,7 +109,7 @@
|
|
146 |
"True"
|
147 |
]
|
148 |
},
|
149 |
-
"execution_count":
|
150 |
"metadata": {},
|
151 |
"output_type": "execute_result"
|
152 |
}
|
|
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
+
"execution_count": 4,
|
6 |
"metadata": {},
|
7 |
"outputs": [
|
8 |
{
|
9 |
"name": "stdout",
|
10 |
"output_type": "stream",
|
11 |
"text": [
|
12 |
+
"Number of document chunks: 0\n",
|
13 |
+
"Number of githb chunks: 0\n",
|
|
|
|
|
|
|
|
|
14 |
"\n",
|
15 |
+
"Sample search result(n=2): \n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
]
|
17 |
}
|
18 |
],
|
|
|
23 |
"# Show everything in the Document collection\n",
|
24 |
"numDocumentChunks = 0\n",
|
25 |
"# Note with_vectors defaults to false, so the vectors are not returned\n",
|
26 |
+
"chunks = qClient.scroll(collection_name='Document', limit=100)\n",
|
27 |
+
"while True:\n",
|
28 |
+
" for chunk in chunks[0]:\n",
|
29 |
+
" if numDocumentChunks == 0:\n",
|
30 |
+
" sampleDocumentChunk = chunk\n",
|
31 |
+
" numDocumentChunks += 1\n",
|
32 |
+
" chunks = qClient.scroll(collection_name='Document', limit=100, with_payload=False, offset=chunks[1])\n",
|
33 |
+
" if chunks[1] is None:\n",
|
34 |
+
" break\n",
|
35 |
"print(\"Number of document chunks: \", numDocumentChunks)\n",
|
36 |
"if numDocumentChunks > 0:\n",
|
37 |
" print(\"\\nSample document chunk(metadata not the vector): \")\n",
|
|
|
39 |
"\n",
|
40 |
"# Show everything in the Github collection\n",
|
41 |
"numGithubChunks = 0\n",
|
42 |
+
"# Note with_vectors defaults to false, so the vectors are not returned\n",
|
43 |
+
"chunks = qClient.scroll(collection_name='Github', limit=100)\n",
|
44 |
+
"while True:\n",
|
45 |
+
" for chunk in chunks[0]:\n",
|
46 |
+
" if numGithubChunks == 0:\n",
|
47 |
+
" sampleGithubChunk = chunk\n",
|
48 |
+
" numGithubChunks += 1\n",
|
49 |
+
" chunks = qClient.scroll(collection_name='Github', limit=100, with_payload=False, offset=chunks[1])\n",
|
50 |
+
" if chunks[1] is None:\n",
|
51 |
+
" break\n",
|
52 |
+
"print(\"Number of github chunks: \", numGithubChunks)\n",
|
53 |
"if numGithubChunks > 0:\n",
|
54 |
+
" print(\"\\nSample github chunk(metadata not the vector): \")\n",
|
55 |
" print(sampleGithubChunk, '\\n')\n",
|
56 |
"\n",
|
57 |
"# Show a sample search\n",
|
|
|
59 |
"results = qClient.search(\n",
|
60 |
" collection_name=\"Document\",\n",
|
61 |
" query_vector = embeddingsModel.embed_query(\"What operating system is ROS made for?\"),\n",
|
62 |
+
" limit=10\n",
|
63 |
")\n",
|
64 |
"print(\"\\nSample search result(n=10): \")\n",
|
65 |
"for result in results:\n",
|
|
|
68 |
},
|
69 |
{
|
70 |
"cell_type": "code",
|
71 |
+
"execution_count": 22,
|
72 |
"metadata": {},
|
73 |
"outputs": [
|
74 |
{
|
|
|
100 |
},
|
101 |
{
|
102 |
"cell_type": "code",
|
103 |
+
"execution_count": 3,
|
104 |
"metadata": {},
|
105 |
"outputs": [
|
106 |
{
|
|
|
109 |
"True"
|
110 |
]
|
111 |
},
|
112 |
+
"execution_count": 3,
|
113 |
"metadata": {},
|
114 |
"output_type": "execute_result"
|
115 |
}
|
project/Tools/mongoTools.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|