# Spaces: Running
import hashlib
import os
from collections import deque

import streamlit as st
from dotenv import load_dotenv
from github import Github
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
# Load environment variables from a local .env file, if present
load_dotenv()
# Module-level setting read by generate_response(); None when the variable is unset.
openai_api_key = os.environ.get("OPENAI_API_KEY")
# Fetch repository data from GitHub
def fetch_github_repo_data(repo_name, github_token):
    """Fetch the text content of every readable file in a GitHub repository.

    The repository tree is walked breadth-first starting at the root. Every
    file that decodes as UTF-8 is added to the result under a
    ``File: <path>`` header. Entries that are not UTF-8 text, or that carry
    no base64 payload, are skipped instead of aborting the whole fetch.

    Args:
        repo_name: Repository identifier in ``owner/repo`` form.
        github_token: Token handed to the ``Github`` client.

    Returns:
        The concatenated file sections (an empty string when no file was
        readable), or ``None`` when the fetch failed. Failures are reported
        to the user through ``st.error``.
    """
    try:
        repo = Github(github_token).get_repo(repo_name)
        # deque gives O(1) pops from the front while keeping the original
        # breadth-first visiting order.
        pending = deque(repo.get_contents(""))
        sections = []
        while pending:
            entry = pending.popleft()
            if entry.type == "dir":
                pending.extend(repo.get_contents(entry.path))
                continue

            fetched = repo.get_contents(entry.path)
            # NOTE(review): decoded_content is understood to support only
            # base64 payloads. Oversized files, submodules, or a path that
            # resolves to a listing have none, and used to abort the entire
            # fetch via the outer handler. Skip them like other non-text
            # entries - confirm against the PyGithub version in use.
            if getattr(fetched, "encoding", None) != "base64":
                continue

            try:
                text = fetched.decoded_content.decode("utf-8")
            except UnicodeDecodeError:
                # Binary (non-text) file: nothing useful to index.
                continue
            sections.append(f"\n\nFile: {entry.path}\n{text}")

        # Join once at the end instead of growing a string inside the loop.
        return "".join(sections)
    except Exception as e:
        # UI boundary: surface any API/network problem instead of crashing.
        st.error(f"Error fetching GitHub repository data: {e}")
        return None
# Generate a response using OpenAI
def generate_response(context, question, *, model="gpt-4o-mini", max_tokens=150):
    """Ask an OpenAI chat model to answer *question* using *context*.

    Args:
        context: Repository text placed in the prompt as grounding material.
        question: The user's question.
        model: Chat model name; defaults to the previously hard-coded value.
        max_tokens: Upper bound on the length of the generated answer;
            defaults to the previously hard-coded value.

    Returns:
        The answer text with surrounding whitespace removed, or ``None`` when
        the request failed or the model returned no text. Problems are
        reported to the user through ``st.error`` / ``st.warning``.
    """
    try:
        # Uses the module-level OpenAI import and API key.
        client = OpenAI(api_key=openai_api_key)
        messages = [
            {"role": "system", "content": "You are an assistant that answers questions based on repository content."},
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"},
        ]
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
        )
        content = response.choices[0].message.content
        if content is None:
            # The message content is optional in the API response; calling
            # .strip() on None used to surface as a confusing AttributeError.
            st.warning("The model returned no text.")
            return None
        return content.strip()
    except Exception as e:
        # UI boundary: report API/configuration errors instead of crashing.
        st.error(f"Error generating response: {e}")
        return None
# Perform RAG using OpenAI and Chroma
def perform_rag(repo_data, question):
    """Answer *question* from *repo_data* via retrieval-augmented generation.

    The repository text is split into chunks, embedded, and stored in a
    Chroma collection whose name is derived from the text itself. The chunks
    most similar to the question are then passed to ``generate_response`` as
    context.

    Args:
        repo_data: Concatenated repository text to search.
        question: The user's question.

    Returns:
        The generated answer, or ``None`` when there is nothing to search,
        nothing relevant was found, or an error occurred. Problems are
        reported to the user through ``st.warning`` / ``st.error``.
    """
    try:
        if not repo_data:
            st.warning("Repository data is empty.")
            return None

        embeddings = HuggingFaceEmbeddings()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=20, length_function=len
        )
        chunks = text_splitter.create_documents([repo_data])
        if not chunks:
            # Nothing to index (e.g. whitespace-only input).
            st.warning("Repository data is empty.")
            return None

        # One collection per distinct repository text, with stable chunk ids.
        # Previously every call appended all chunks under fresh ids to the
        # same default collection in the same directory, so duplicates piled
        # up and chunks from earlier repositories leaked into later answers.
        digest = hashlib.sha256(repo_data.encode("utf-8", errors="replace")).hexdigest()
        collection_name = f"repo-{digest[:40]}"
        chunk_ids = [f"chunk-{index}" for index in range(len(chunks))]

        vectordb = Chroma.from_documents(
            documents=chunks,
            embedding=embeddings,
            ids=chunk_ids,
            collection_name=collection_name,
            persist_directory="github_repo_embeddings",
        )
        # NOTE(review): kept from the original for Chroma versions that need
        # an explicit flush; newer versions are understood to persist
        # automatically - confirm whether this call is still required.
        vectordb.persist()

        # The store returned above is directly queryable; re-opening it from
        # disk was redundant.
        docs = vectordb.similarity_search(question)
        if not docs:
            st.warning("No relevant documents found.")
            return None

        # Use every retrieved chunk rather than discarding all but the first.
        context = "\n\n".join(doc.page_content for doc in docs)
        return generate_response(context, question)
    except Exception as e:
        # UI boundary: report embedding/vector-store errors instead of crashing.
        st.error(f"Error performing RAG: {e}")
        return None
# Streamlit application entry point
def main():
    """Render the Streamlit page: collect inputs, fetch the repo, answer questions.

    Streamlit re-executes the script on every widget interaction, so the
    fetched repository text is kept in ``st.session_state`` and downloaded
    again only when the repository name or token changes. Failed fetches are
    not cached, so they are retried on the next rerun.
    """
    st.title("Chat with GitHub Repository")
    st.caption("This app allows you to interact with a GitHub repository using OpenAI and ChromaDB.")

    github_token = st.text_input("Enter your GitHub Token", type="password")
    git_repo = st.text_input("Enter the GitHub Repo (owner/repo)")
    if not (github_token and git_repo):
        return

    # Reuse the previous download while the inputs are unchanged; otherwise
    # every keystroke in the question box would re-fetch the whole repository.
    source_key = (git_repo, github_token)
    cached = st.session_state.get("repo_cache")
    if cached is not None and cached["source"] == source_key:
        repo_data = cached["data"]
    else:
        repo_data = fetch_github_repo_data(git_repo, github_token)
        if repo_data:
            st.session_state["repo_cache"] = {"source": source_key, "data": repo_data}

    if not repo_data:
        st.error("Failed to fetch repository data. Ensure the repository name and token are correct.")
        return

    st.success(f"Successfully added {git_repo} to the knowledge base!")
    question = st.text_input("Ask any question about the repository")
    if not question:
        return

    answer = perform_rag(repo_data, question)
    if answer:
        st.subheader("Generated Answer:")
        st.write(answer)


if __name__ == "__main__":
    main()