File size: 4,370 Bytes
dc56243
 
 
 
 
 
5d9493d
 
dc56243
5d9493d
 
dc56243
5d9493d
dc56243
5d9493d
dc56243
 
178a171
 
ad91929
 
dc56243
ad91929
5d9493d
5d7e380
 
5d9493d
ad91929
dc56243
5d9493d
 
dc56243
5d9493d
dc56243
5d7e380
5d9493d
 
bdd7b82
5d7e380
 
 
 
bdd7b82
 
5d7e380
 
bdd7b82
 
5d9493d
 
 
 
 
 
5d7e380
5d9493d
 
dc56243
5d9493d
5d7e380
5d9493d
dc56243
5d9493d
 
 
 
dc56243
 
ad91929
5d7e380
 
 
 
 
 
 
5d9493d
 
 
 
 
5d7e380
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc56243
 
5d9493d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import requests
from datasets import load_dataset

# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(page_title="Repository Recommender", layout="wide")

# Model/tokenizer loading, cached once per Streamlit session
@st.cache_resource
def load_model():
    """Load the CodeT5 tokenizer and encoder model.

    Returns:
        tuple: (tokenizer, model, device) where device is "cuda" when a
        GPU is available, otherwise "cpu".
    """
    checkpoint = "Salesforce/codet5-small"
    # Prefer GPU when one is present
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    net = AutoModel.from_pretrained(checkpoint).to(target_device)
    return tok, net, target_device

def generate_embedding(text, tokenizer, model, device):
    """Return a mean-pooled encoder embedding for ``text`` as a 1-D numpy array.

    Args:
        text: Input string; whitespace-only input yields a zero vector.
        tokenizer: Hugging Face tokenizer matching ``model``.
        model: Encoder-decoder model (only the encoder is used).
        device: Torch device string the model lives on ("cuda"/"cpu").
    """
    # Derive the embedding width from the model config instead of hard-coding
    # 512, so the zero-vector fallback always matches real embeddings even if
    # the checkpoint changes. T5-style configs expose d_model; others hidden_size.
    hidden = getattr(model.config, "d_model", None) or getattr(model.config, "hidden_size", 512)
    if not text.strip():
        return np.zeros(hidden)  # Handle empty input gracefully
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():  # inference only — no gradient bookkeeping
        outputs = model.encoder(**inputs)
    # Mean over the sequence axis gives a fixed-size vector per input.
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Dataset loading + embedding precomputation, cached across reruns.
# Leading-underscore params tell st.cache_data not to hash the model objects.
@st.cache_data
def load_data(_tokenizer, _model, _device):
    """Load the repository dataset and attach an 'embedding' column.

    Returns:
        pandas.DataFrame: First 500 rows with 'docstring'/'summary' filled
        and a per-row numpy embedding in 'embedding'.
    """
    dataset = load_dataset("frankjosh/filtered_dataset", split="train")
    df = pd.DataFrame(dataset).head(500)  # Limit to 500 repositories

    # BUG FIX: df.get('col', "") returns the *string* "" when the column is
    # absent, and "".fillna(...) raises AttributeError. Check for the column
    # explicitly and create it when missing.
    for col in ("docstring", "summary"):
        if col in df.columns:
            df[col] = df[col].fillna("")
        else:
            df[col] = ""

    # Generate embeddings for each row from the concatenated text fields
    def compute_embedding(row):
        text = f"{row['docstring']} {row['summary']}"
        return generate_embedding(text, _tokenizer, _model, _device)

    df['embedding'] = df.apply(compute_embedding, axis=1)
    return df

def fetch_readme(repo_url):
    """Fetch the raw README.md text of a GitHub repository.

    Args:
        repo_url: Repository URL like ``https://github.com/owner/repo``.

    Returns:
        str: README contents, or a human-readable message on failure.
    """
    # BUG FIX: the old "/blob/main/README.md" URL returns GitHub's HTML page
    # (status 200), not the file contents. raw.githubusercontent.com serves
    # the actual markdown. Also try "master" for older default branches.
    base = repo_url.rstrip("/").replace(
        "https://github.com/", "https://raw.githubusercontent.com/", 1
    )
    for branch in ("main", "master"):
        try:
            response = requests.get(f"{base}/{branch}/README.md", timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.exceptions.RequestException as e:
            return f"Error fetching README: {e}"
    return "README not available."

# Main application logic
def main():
    st.title("Repository Recommender System")
    st.write("Find Python repositories to learn production-level coding practices.")

    # Load resources
    tokenizer, model, device = load_model()

    with st.spinner("Loading dataset and generating embeddings. This may take a moment..."):
        try:
            data = load_data(tokenizer, model, device)
        except Exception as e:
            st.error(f"Error loading dataset: {e}")
            return

    # Input user query
    user_query = st.text_input("Describe your project or learning goal:",
                               "I am working on a project to recommend music using pandas and numpy.")
    if user_query:
        with st.spinner("Processing your query..."):
            query_embedding = generate_embedding(user_query, tokenizer, model, device)

            # Compute similarity
            try:
                data['similarity'] = data['embedding'].apply(
                    lambda emb: cosine_similarity([query_embedding], [np.array(emb)])[0][0]
                )

                # Filter and sort recommendations
                top_recommendations = (
                    data.sort_values(by='similarity', ascending=False)
                    .head(5)
                )

                # Display recommendations
                st.subheader("Top Recommendations")
                for idx, row in top_recommendations.iterrows():
                    st.markdown(f"### {row['repo']}")
                    st.write(f"**Path:** {row['path']}")
                    st.write(f"**Summary:** {row['summary']}")
                    st.write(f"**Similarity Score:** {row['similarity']:.2f}")
                    st.markdown(f"[Repository Link]({row['url']})")

                    # Fetch and display README
                    st.subheader("Repository README")
                    readme_content = fetch_readme(row['url'])
                    st.code(readme_content)

            except Exception as e:
                st.error(f"Error computing recommendations: {e}")

# Run the app only when executed directly, not when imported as a module.
if __name__ == "__main__":
    main()