Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -22,6 +22,8 @@ def load_model():
|
|
22 |
|
23 |
def generate_embedding(text, tokenizer, model, device):
|
24 |
"""Generate embeddings for a given text."""
|
|
|
|
|
25 |
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
26 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
27 |
with torch.no_grad():
|
@@ -30,14 +32,18 @@ def generate_embedding(text, tokenizer, model, device):
|
|
30 |
|
31 |
# Load dataset
|
32 |
@st.cache_data
|
33 |
-
def load_data(
|
34 |
dataset = load_dataset("frankjosh/filtered_dataset", split="train")
|
35 |
df = pd.DataFrame(dataset).head(500) # Limit to 500 repositories
|
36 |
|
|
|
|
|
|
|
|
|
37 |
# Generate embeddings for each row
|
38 |
def compute_embedding(row):
|
39 |
-
text = f"{row['docstring']} {row['summary']}"
|
40 |
-
return generate_embedding(text,
|
41 |
|
42 |
df['embedding'] = df.apply(compute_embedding, axis=1)
|
43 |
return df
|
@@ -46,12 +52,12 @@ def fetch_readme(repo_url):
|
|
46 |
"""Fetch README file from GitHub repository."""
|
47 |
try:
|
48 |
readme_url = repo_url.rstrip("/") + "/blob/main/README.md"
|
49 |
-
response = requests.get(readme_url)
|
50 |
if response.status_code == 200:
|
51 |
return response.text
|
52 |
else:
|
53 |
return "README not available."
|
54 |
-
except
|
55 |
return f"Error fetching README: {e}"
|
56 |
|
57 |
# Main application logic
|
@@ -61,38 +67,49 @@ def main():
|
|
61 |
|
62 |
# Load resources
|
63 |
tokenizer, model, device = load_model()
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
# Input user query
|
67 |
user_query = st.text_input("Describe your project or learning goal:",
|
68 |
"I am working on a project to recommend music using pandas and numpy.")
|
69 |
if user_query:
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
if __name__ == "__main__":
|
98 |
main()
|
|
|
22 |
|
23 |
def generate_embedding(text, tokenizer, model, device):
|
24 |
"""Generate embeddings for a given text."""
|
25 |
+
if not text.strip():
|
26 |
+
return np.zeros(512) # Handle empty input gracefully
|
27 |
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
|
28 |
inputs = {k: v.to(device) for k, v in inputs.items()}
|
29 |
with torch.no_grad():
|
|
|
32 |
|
33 |
# Load dataset
@st.cache_data
def load_data(_tokenizer, _model, _device):
    """Load the filtered dataset and precompute one embedding per repository.

    Leading-underscore parameter names tell Streamlit's cache to skip
    hashing them (tokenizer/model/device objects are unhashable).

    Returns:
        pd.DataFrame: first 500 rows of the dataset with an added
        'embedding' column.
    """
    dataset = load_dataset("frankjosh/filtered_dataset", split="train")
    df = pd.DataFrame(dataset).head(500)  # Limit to 500 repositories

    # Fill missing values to avoid errors.
    # NOTE: the previous form df.get(col, "").fillna("") raised
    # AttributeError whenever the column was absent, because
    # DataFrame.get returns the *string* default "" and str has no
    # .fillna. Create the column first, then fill NaNs.
    for col in ('docstring', 'summary'):
        if col not in df.columns:
            df[col] = ""
        df[col] = df[col].fillna("")

    # Generate embeddings for each row
    def compute_embedding(row):
        text = f"{row['docstring']} {row['summary']}"
        return generate_embedding(text, _tokenizer, _model, _device)

    df['embedding'] = df.apply(compute_embedding, axis=1)
    return df
|
|
|
52 |
def fetch_readme(repo_url):
    """Fetch the README file of a GitHub repository.

    Args:
        repo_url: Repository URL, e.g. "https://github.com/user/repo".

    Returns:
        str: README text on success, otherwise a human-readable message
        (this function never raises for network errors).
    """
    try:
        # Use raw.githubusercontent.com: the previous ".../blob/main/README.md"
        # URL is the GitHub *HTML page*, so response.text was page markup,
        # not the README content itself.
        base = repo_url.rstrip("/")
        raw_url = base.replace(
            "https://github.com/", "https://raw.githubusercontent.com/"
        ) + "/main/README.md"
        response = requests.get(raw_url, timeout=10)
        if response.status_code == 200:
            return response.text
        else:
            # Covers 404 (no README, or default branch is not 'main').
            return "README not available."
    except requests.exceptions.RequestException as e:
        return f"Error fetching README: {e}"
|
62 |
|
63 |
# Main application logic
|
|
|
67 |
|
68 |
# Load resources
|
69 |
tokenizer, model, device = load_model()
|
70 |
+
|
71 |
+
with st.spinner("Loading dataset and generating embeddings. This may take a moment..."):
|
72 |
+
try:
|
73 |
+
data = load_data(tokenizer, model, device)
|
74 |
+
except Exception as e:
|
75 |
+
st.error(f"Error loading dataset: {e}")
|
76 |
+
return
|
77 |
|
78 |
# Input user query
|
79 |
user_query = st.text_input("Describe your project or learning goal:",
|
80 |
"I am working on a project to recommend music using pandas and numpy.")
|
81 |
if user_query:
|
82 |
+
with st.spinner("Processing your query..."):
|
83 |
+
query_embedding = generate_embedding(user_query, tokenizer, model, device)
|
84 |
+
|
85 |
+
# Compute similarity
|
86 |
+
try:
|
87 |
+
data['similarity'] = data['embedding'].apply(
|
88 |
+
lambda emb: cosine_similarity([query_embedding], [np.array(emb)])[0][0]
|
89 |
+
)
|
90 |
+
|
91 |
+
# Filter and sort recommendations
|
92 |
+
top_recommendations = (
|
93 |
+
data.sort_values(by='similarity', ascending=False)
|
94 |
+
.head(5)
|
95 |
+
)
|
96 |
+
|
97 |
+
# Display recommendations
|
98 |
+
st.subheader("Top Recommendations")
|
99 |
+
for idx, row in top_recommendations.iterrows():
|
100 |
+
st.markdown(f"### {row['repo']}")
|
101 |
+
st.write(f"**Path:** {row['path']}")
|
102 |
+
st.write(f"**Summary:** {row['summary']}")
|
103 |
+
st.write(f"**Similarity Score:** {row['similarity']:.2f}")
|
104 |
+
st.markdown(f"[Repository Link]({row['url']})")
|
105 |
+
|
106 |
+
# Fetch and display README
|
107 |
+
st.subheader("Repository README")
|
108 |
+
readme_content = fetch_readme(row['url'])
|
109 |
+
st.code(readme_content)
|
110 |
+
|
111 |
+
except Exception as e:
|
112 |
+
st.error(f"Error computing recommendations: {e}")
|
113 |
|
114 |
# Script entry point: run the app only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|