# See README for more info on how the DataCollectionPipeline works
# The ETL pipeline is part of the DataCollectionPipeline
# Remove the time.sleep(0.1) line if you are sure you won't get blocked from a webpage for requesting too often
import os
import re
import shutil
import subprocess
import sys
import tempfile
import time
import urllib.parse

import pymongo
import requests
from bs4 import BeautifulSoup
from clearml import PipelineDecorator
from dotenv import load_dotenv

# Set up ClearML
try:
    load_dotenv(override=True)
except Exception:
    load_dotenv(sys.path[1] + "/.env", override=True)
CLEARML_WEB_HOST = os.getenv("CLEARML_WEB_HOST")
CLEARML_API_HOST = os.getenv("CLEARML_API_HOST")
CLEARML_FILES_HOST = os.getenv("CLEARML_FILES_HOST")
CLEARML_API_ACCESS_KEY = os.getenv("CLEARML_API_ACCESS_KEY")
CLEARML_API_SECRET_KEY = os.getenv("CLEARML_API_SECRET_KEY")
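# Note: the ClearML SDK reads these credentials from the environment populated by load_dotenv;
# the variables above are not referenced again below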

# The input to the Data Collection Pipeline is a list of links to domains
"""
links = [
    "https://www.ros.org/",
    "https://docs.nav2.org/",
    "https://moveit.ai/",
    "https://gazebosim.org/home",
    "https://github.com/ros2/ros2",
    "https://github.com/ros-navigation/navigation2",
    "https://github.com/moveit/moveit2",
    "https://github.com/gazebosim/gazebo-classic",
]
"""
links = [ "https://www.ros.org/", "https://github.com/ros2/ros2" ]


# ETL pipeline
@PipelineDecorator.component(cache=False, return_values=["documents", "codes"])
def ETL_Pipeline(links):
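    """Extract pages and repos from the links, clean the text, and load it into MongoDB.

    Returns two lists of new records: documents (web pages) and codes (GitHub files).
    """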
    # Create a mongoDB connection to check for duplicates before inserting
    try:
        load_dotenv(override=True)
    except Exception:
        load_dotenv(sys.path[1] + "/.env", override=True)
    DATABASE_HOST = os.getenv("DATABASE_HOST")
    mongoHost = pymongo.MongoClient(DATABASE_HOST)
    mongoDatabase = mongoHost["twin"]
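    # Both collections ("Document" and "Github") live in the "twin" database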

    # Extract data from links and their subdirectories (using crawlers)
    documents = []
    codes = []
    for link in links:
        # Web scraper/crawler for github links
        if "https://github.com" in link:
            # Do not revisit a link already in the database
            mongoCollection = mongoDatabase["Github"]
            result = mongoCollection.find_one({"link": link})
            if result is None:
                # Modified GithubCrawler from LLM-Engineer for scraping github
                local_temp = tempfile.mkdtemp()
                try:
                    # Clone into the temp directory without changing the working directory
                    subprocess.run(["git", "clone", link], cwd=local_temp, check=True)
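                    # The clone creates a single directory inside local_temp: the repo root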
                    repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])
                    tree = {}
                    for root, _, files in os.walk(repo_path):
                        dir = root.replace(repo_path, "").lstrip("/")
                        if dir.startswith((".git", ".toml", ".lock", ".png")):
                            continue
                        for file in files:
                            if file.endswith((".git", ".toml", ".lock", ".png")):
                                continue
                            file_path = os.path.join(dir, file)
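                            # Note: all spaces are stripped from the file contents below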
                            with open(
                                os.path.join(root, file), "r", errors="ignore"
                            ) as f:
                                tree[file_path] = f.read().replace(" ", "")
                    # Rebuild each file's canonical GitHub URL from the repo page
                    r = requests.get(link)
                    soup = BeautifulSoup(r.content, "html.parser")
                    # Find the file path to any of the files in the repository
                    link_element = soup.find("a", attrs={"class": "Link--primary"})
                    path = link_element.get("href")
                    path = path.rsplit("/", 1)[0]
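                    # path is typically "/<owner>/<repo>/blob/<default-branch>" at this point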
                    # Collect every file in the repo; they are inserted into Mongo at the end
                    for subdirectory in tree:
                        text = tree[subdirectory]
                        # Transform: collapse tabs/newlines and repeated spaces
                        text = text.replace("\t", " ")
                        text = text.replace("\n", " ")
                        text = re.sub(r" +", " ", text)
                        codes.append(
                            {
                                "link": "https://github.com"
                                + path
                                + "/"
                                + subdirectory,
                                "type": "Github",
                                "content": text,
                            }
                        )
                except Exception:
                    print(f"Error scraping {link}")
                finally:
                    # Always clean up the cloned repo
                    shutil.rmtree(local_temp)
        # Web scraper/crawler for other links (Documents)
        else:
            # Do not revisit a link already in the database
            mongoCollection = mongoDatabase["Document"]
            result = mongoCollection.find_one({"link": link})
            if result is None:
                try:
                    # Get all text in the website
                    r = requests.get(link)
                    soup = BeautifulSoup(r.content, "html.parser")
                    text = soup.get_text()
                    # Transform: collapse tabs/newlines and repeated spaces
                    text = text.replace("\t", " ")
                    text = text.replace("\n", " ")
                    text = re.sub(r" +", " ", text)
                    if "404" not in text:
                        documents.append({"link": link, "type": "Document", "content": text})
                    # Also crawl through all subdirectories in the link (related links)
                    subdirectories = [a.get("href") for a in soup.find_all("a")]
                    for subdirectory in subdirectories:
                        # Skip anchors that have no href attribute
                        if subdirectory is None:
                            continue
                        newLink = urllib.parse.urljoin(link, subdirectory)
                        if (
                            "http" not in subdirectory
                            and "#" not in subdirectory
                            and ".zip" not in subdirectory
                            and ".pdf" not in subdirectory
                            and mongoCollection.find_one({"link": newLink}) is None
                            and newLink not in links
                        ):
                            links.append(newLink)
                except Exception:
                    print("Could not crawl link", link)
        # Avoid spamming sites
        time.sleep(0.1)
    # Load: each record has a link, a type (Github or Document), and content (text)
    # Guard against empty lists: pymongo's insert_many raises on an empty batch
    if documents:
        mongoDatabase["Document"].insert_many(documents)
    if codes:
        mongoDatabase["Github"].insert_many(codes)
    return documents, codes


# Allow ClearML to monitor and run the ETL pipeline
@PipelineDecorator.pipeline(
    name="Data Collection Pipeline",
    project="RAG LLM",
    version="0.4",
)
def main():
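    """Run the ETL component on the seed links under the ClearML pipeline controller."""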
    return ETL_Pipeline(links)


if __name__ == "__main__":
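    # Run the pipeline locally instead of enqueuing it to a ClearML agent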
    PipelineDecorator.run_locally()
    main()