File size: 3,703 Bytes
b7064d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import shutil
import git
from urllib.parse import urlparse

# Base directory under which the "staging" clone area lives: the process
# working directory captured once, when this module is first executed.
local_dir = os.getcwd()
# Set by clone_repo() to the cloned repository's head reference and read by
# load_repo() when constructing the GitLoader; None until a clone succeeds.
branch = None

# Function to extract repository name from URL
def get_repo_name(url):
    """Return the repository name encoded in a Git remote URL.

    The name is the last component of the URL path, with a trailing ".git"
    suffix removed when (and only when) it is present.

    Args:
        url: Remote URL, e.g. "https://github.com/user/repo.git" or
            "https://github.com/user/repo".

    Returns:
        The repository name, e.g. "repo". An empty string is returned when
        the URL has no path component.
    """
    parsed_url = urlparse(url)
    # Drop trailing slashes first; otherwise basename() of "/user/repo/"
    # would be "" instead of "repo".
    repo_name = os.path.basename(parsed_url.path.rstrip("/"))
    # Strip the suffix only if it is really there. Slicing unconditionally
    # chopped four characters off URLs given without ".git".
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-4]
    return repo_name

# Clone a remote Git repository into the local staging area
def clone_repo(url):
    """Clone *url* into ``<local_dir>/staging/<repository name>``.

    When the clone succeeds, the module-level ``branch`` is set to the new
    clone's ``repo.head.reference``.

    Args:
        url: Remote URL of the repository to clone.

    Returns:
        True when a fresh clone was created. False when the target path
        already exists, or when any exception is raised along the way (the
        error is printed rather than propagated).
    """
    global branch
    try:
        repo_name = get_repo_name(url)
        target = os.path.join(local_dir, "staging", repo_name)

        if not os.path.exists(target):
            cloned = git.Repo.clone_from(url, target)
            branch = cloned.head.reference
            print(f"{repo_name} cloned succesfully")
            return True

        # An existing staging directory is treated as "already handled".
        print(f"{repo_name} already added in db")
        return False
    except Exception as err:
        print(f"Error cloning the git repository: {err}")
        return False

def delete_cloned_repo(url):
    """Remove the staged clone of *url* from ``<local_dir>/staging``.

    Deletion is best effort: nothing is raised to the caller. The outcome is
    reported on stdout, including the case where the directory could not be
    removed completely.

    Args:
        url: Remote URL whose staged clone should be deleted.
    """
    local_path = os.path.join(local_dir, "staging", get_repo_name(url))
    try:
        if not os.path.exists(local_path):
            print(f"Repository at {local_path} does not exist.")
            return

        # ignore_errors=True keeps removing whatever can be removed even if
        # individual entries fail (e.g. permission problems).
        shutil.rmtree(local_path, ignore_errors=True)

        # Because errors are suppressed above, verify the result instead of
        # announcing success unconditionally.
        if os.path.exists(local_path):
            print(f"Error deleting repository: {local_path} could not be fully removed.")
        else:
            print(f"Repository at {local_path} successfully deleted.")
    except Exception as e:
        print(f"Error deleting repository: {e}")

from langchain_community.document_loaders import GitLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
import qdrant_client

# Shared splitter used by load_repo() to break loaded documents into chunks
# (chunk_size=1000 with an overlap of 20) before they are added to the
# vector store.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 20,
)

# from langchain_together.embeddings import TogetherEmbeddings
# NOTE: never commit API keys; the key previously hard-coded on this line should be revoked.
# embeddings2 = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-8k-retrieval",together_api_key=os.getenv("TOGETHER_API_KEY"))

# Module-wide Qdrant client, created at import time. The location and API key
# are read from the QDRANT_HOST and QDRANT_API_KEY environment variables;
# os.getenv() yields None for either one that is unset.
client = qdrant_client.QdrantClient(
    os.getenv("QDRANT_HOST"),
    api_key=os.getenv("QDRANT_API_KEY")
)

from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
# Embedding model handed to the Qdrant vector store built in load_repo().
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5")
# NOTE(review): load_repo() assigns to a *local* variable of the same name
# (it has no `global vectorstore`), so nothing visible in this file ever
# rebinds this module-level value — confirm whether that is intended.
vectorstore = None

def load_repo(url):
    """Index the staged clone of *url* into a freshly recreated Qdrant collection.

    The collection is named after the repository and is recreated on every
    call. The staged clone is then loaded with GitLoader (skipping files
    whose path ends in "package-lock.json"), split into chunks and added to
    the collection.

    Args:
        url: Remote URL of a repository already cloned under
            ``<local_dir>/staging``.

    Returns:
        True when the documents were loaded and added; False when loading,
        splitting or adding raised (the error is printed). Exceptions from
        recreating the collection or building the vector store are not
        caught here.
    """
    name = get_repo_name(url)
    models = qdrant_client.http.models

    vector_params = models.VectorParams(
        # 768 for instructor-xl, 1536 for OpenAI; 384 presumably matches the
        # configured embedding model — confirm if the model changes.
        size=384,
        distance=models.Distance.COSINE,
    )
    client.recreate_collection(collection_name=name, vectors_config=vector_params)

    # NOTE(review): this binds a function-local name; the module-level
    # `vectorstore` is not updated because there is no `global` statement.
    vectorstore = Qdrant(client=client, collection_name=name, embeddings=embeddings)
    print("collection created")

    def _wanted(file_path):
        # Lock files are skipped when loading the repository.
        return not file_path.endswith("package-lock.json")

    try:
        repo_path = os.path.join(local_dir, "staging", name)
        documents = GitLoader(
            repo_path=repo_path,
            branch=branch,
            file_filter=_wanted,
        ).load()
        pieces = text_splitter.split_documents(documents)
        print("chunks created")
        vectorstore.add_documents(pieces)
    except Exception as e:
        print(f"Error loading and indexing repository: {e}") 
        return False
    return True
    
def repository_loader(url):
    """Clone the repository at *url*, index it, and remove the staging clone.

    The staging clone is removed whenever cloning succeeded, regardless of
    whether indexing succeeded. Leaving it behind after a failed load would
    make every later call stop in clone_repo() with "already added in db",
    so the repository could never be retried.

    Args:
        url: Remote URL of the repository to ingest.

    Returns:
        True when the repository was cloned and indexed, otherwise False.
    """
    if not clone_repo(url):
        return False
    try:
        return load_repo(url)
    finally:
        # Runs on success, on a False result, and when load_repo() raises.
        delete_cloned_repo(url)



# Module-level statement: prints a marker line to stdout whenever this file
# is executed or imported.
print('HELLO FROM CONTAINER')
#answer_query("How is the routing done in this project and what are the routes used",'https://github.com/s0ham075/Google-Docs-Frontend.git')

# delete_cloned_repo()