Spaces:
Sleeping
Sleeping
File size: 4,370 Bytes
dc56243 5d9493d dc56243 5d9493d dc56243 5d9493d dc56243 5d9493d dc56243 178a171 ad91929 dc56243 ad91929 5d9493d 5d7e380 5d9493d ad91929 dc56243 5d9493d dc56243 5d9493d dc56243 5d7e380 5d9493d bdd7b82 5d7e380 bdd7b82 5d7e380 bdd7b82 5d9493d 5d7e380 5d9493d dc56243 5d9493d 5d7e380 5d9493d dc56243 5d9493d dc56243 ad91929 5d7e380 5d9493d 5d7e380 dc56243 5d9493d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import requests
from datasets import load_dataset
# Configure the Streamlit page; must be the first st.* call in the script.
st.set_page_config(page_title="Repository Recommender", layout="wide")
# Model loading (cached once per Streamlit session).
@st.cache_resource
def load_model():
    """Load the CodeT5 tokenizer and model, placed on GPU when available.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the
        string ``"cuda"`` or ``"cpu"`` the model was moved to.
    """
    checkpoint = "Salesforce/codet5-small"
    # Prefer GPU when one is available; fall back to CPU otherwise.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint).to(device)
    return tokenizer, model, device
def generate_embedding(text, tokenizer, model, device):
    """Return a 1-D mean-pooled encoder embedding for *text*.

    Args:
        text: Input string (repo docstring/summary or a user query).
        tokenizer: HF tokenizer matching *model*.
        model: Encoder-decoder model; only its encoder is invoked.
        device: Device string ("cuda"/"cpu") where *model* lives.

    Returns:
        np.ndarray of shape (hidden_size,). Blank input yields a zero
        vector instead of feeding an empty batch to the model.
    """
    if not text.strip():
        # Derive the vector width from the model config rather than
        # hard-coding 512, so this stays correct if the checkpoint changes.
        hidden = getattr(getattr(model, "config", None), "hidden_size", 512)
        return np.zeros(hidden)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.encoder(**inputs)
    # Mean-pool over the sequence dimension -> (hidden_size,) numpy vector.
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
# Dataset loading (cached by Streamlit; leading-underscore args are
# excluded from the cache key, since model objects are not hashable).
@st.cache_data
def load_data(_tokenizer, _model, _device):
    """Load the repository dataset and precompute one embedding per row.

    Returns:
        pd.DataFrame: first 500 repositories with a filled 'docstring'
        and 'summary' column and a new 'embedding' column (np.ndarray).
    """
    dataset = load_dataset("frankjosh/filtered_dataset", split="train")
    df = pd.DataFrame(dataset).head(500)  # Limit to 500 repositories
    # Guarantee the text columns exist and hold no NaN. Note: df.get(col, "")
    # returns the plain string "" when the column is missing, and a string
    # has no .fillna() — so create missing columns explicitly first.
    for col in ('docstring', 'summary'):
        if col not in df.columns:
            df[col] = ""
        df[col] = df[col].fillna("")
    # Generate embeddings for each row.
    def compute_embedding(row):
        text = f"{row['docstring']} {row['summary']}"
        return generate_embedding(text, _tokenizer, _model, _device)
    df['embedding'] = df.apply(compute_embedding, axis=1)
    return df
def readme_raw_url(repo_url, branch="main"):
    """Build the raw-content URL for a GitHub repo's README.md.

    Maps https://github.com/owner/repo to
    https://raw.githubusercontent.com/owner/repo/<branch>/README.md.
    """
    base = repo_url.rstrip("/").replace(
        "https://github.com/", "https://raw.githubusercontent.com/"
    )
    return f"{base}/{branch}/README.md"
def fetch_readme(repo_url):
    """Fetch a GitHub repository's README.md as plain text.

    The original implementation requested the '/blob/main/README.md' page,
    which returns GitHub's HTML wrapper rather than the README content;
    this uses the raw-content endpoint instead, trying the 'main' branch
    first and then 'master' for older repositories.

    Returns:
        str: README text, "README not available." when neither branch
        has one, or an error message on network failure (never raises).
    """
    try:
        for branch in ("main", "master"):
            response = requests.get(readme_raw_url(repo_url, branch), timeout=10)
            if response.status_code == 200:
                return response.text
        return "README not available."
    except requests.exceptions.RequestException as e:
        return f"Error fetching README: {e}"
# Main application logic
def main():
    """Render the Streamlit UI: load resources, take a query, show top-5 matches."""
    st.title("Repository Recommender System")
    st.write("Find Python repositories to learn production-level coding practices.")

    # Load resources (both calls are cached by Streamlit across reruns).
    tokenizer, model, device = load_model()
    with st.spinner("Loading dataset and generating embeddings. This may take a moment..."):
        try:
            data = load_data(tokenizer, model, device)
        except Exception as e:
            st.error(f"Error loading dataset: {e}")
            return

    # Ask the user to describe what they want to build/learn.
    user_query = st.text_input(
        "Describe your project or learning goal:",
        "I am working on a project to recommend music using pandas and numpy.",
    )
    if not user_query:
        return

    with st.spinner("Processing your query..."):
        query_embedding = generate_embedding(user_query, tokenizer, model, device)

    try:
        # Cosine similarity between the query and every repository embedding.
        data['similarity'] = data['embedding'].apply(
            lambda emb: cosine_similarity([query_embedding], [np.array(emb)])[0][0]
        )
        # Keep only the five most similar repositories.
        top_recommendations = data.sort_values(by='similarity', ascending=False).head(5)

        st.subheader("Top Recommendations")
        for _, row in top_recommendations.iterrows():
            st.markdown(f"### {row['repo']}")
            st.write(f"**Path:** {row['path']}")
            st.write(f"**Summary:** {row['summary']}")
            st.write(f"**Similarity Score:** {row['similarity']:.2f}")
            st.markdown(f"[Repository Link]({row['url']})")
            # Fetch and display README for each recommendation.
            st.subheader("Repository README")
            st.code(fetch_readme(row['url']))
    except Exception as e:
        st.error(f"Error computing recommendations: {e}")
if __name__ == "__main__":
    main()
|