frankjosh committed on
Commit
5d7e380
·
verified ·
1 Parent(s): bdd7b82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -32
app.py CHANGED
@@ -22,6 +22,8 @@ def load_model():
22
 
23
  def generate_embedding(text, tokenizer, model, device):
24
  """Generate embeddings for a given text."""
 
 
25
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
26
  inputs = {k: v.to(device) for k, v in inputs.items()}
27
  with torch.no_grad():
@@ -30,14 +32,18 @@ def generate_embedding(text, tokenizer, model, device):
30
 
31
  # Load dataset
32
  @st.cache_data
33
- def load_data(tokenizer, model, device):
34
  dataset = load_dataset("frankjosh/filtered_dataset", split="train")
35
  df = pd.DataFrame(dataset).head(500) # Limit to 500 repositories
36
 
 
 
 
 
37
  # Generate embeddings for each row
38
  def compute_embedding(row):
39
- text = f"{row['docstring']} {row['summary']}" if 'docstring' in row and 'summary' in row else ""
40
- return generate_embedding(text, tokenizer, model, device)
41
 
42
  df['embedding'] = df.apply(compute_embedding, axis=1)
43
  return df
@@ -46,12 +52,12 @@ def fetch_readme(repo_url):
46
  """Fetch README file from GitHub repository."""
47
  try:
48
  readme_url = repo_url.rstrip("/") + "/blob/main/README.md"
49
- response = requests.get(readme_url)
50
  if response.status_code == 200:
51
  return response.text
52
  else:
53
  return "README not available."
54
- except Exception as e:
55
  return f"Error fetching README: {e}"
56
 
57
  # Main application logic
@@ -61,38 +67,49 @@ def main():
61
 
62
  # Load resources
63
  tokenizer, model, device = load_model()
64
- data = load_data(tokenizer, model, device)
 
 
 
 
 
 
65
 
66
  # Input user query
67
  user_query = st.text_input("Describe your project or learning goal:",
68
  "I am working on a project to recommend music using pandas and numpy.")
69
  if user_query:
70
- query_embedding = generate_embedding(user_query, tokenizer, model, device)
71
-
72
- # Compute similarity
73
- data['similarity'] = data['embedding'].apply(
74
- lambda emb: cosine_similarity([query_embedding], [np.array(emb)])[0][0]
75
- )
76
-
77
- # Filter and sort recommendations
78
- top_recommendations = (
79
- data.sort_values(by='similarity', ascending=False)
80
- .head(5)
81
- )
82
-
83
- # Display recommendations
84
- st.subheader("Top Recommendations")
85
- for idx, row in top_recommendations.iterrows():
86
- st.markdown(f"### {row['repo']}")
87
- st.write(f"**Path:** {row['path']}")
88
- st.write(f"**Summary:** {row['summary']}")
89
- st.write(f"**Similarity Score:** {row['similarity']:.2f}")
90
- st.markdown(f"[Repository Link]({row['url']})")
91
-
92
- # Fetch and display README
93
- st.subheader("Repository README")
94
- readme_content = fetch_readme(row['url'])
95
- st.code(readme_content)
 
 
 
 
 
96
 
97
  if __name__ == "__main__":
98
  main()
 
22
 
23
  def generate_embedding(text, tokenizer, model, device):
24
  """Generate embeddings for a given text."""
25
+ if not text.strip():
26
+ return np.zeros(512) # Handle empty input gracefully
27
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
28
  inputs = {k: v.to(device) for k, v in inputs.items()}
29
  with torch.no_grad():
 
32
 
33
  # Load dataset
34
  @st.cache_data
35
+ def load_data(_tokenizer, _model, _device):
36
  dataset = load_dataset("frankjosh/filtered_dataset", split="train")
37
  df = pd.DataFrame(dataset).head(500) # Limit to 500 repositories
38
 
39
+ # Fill missing values to avoid errors
40
+ df['docstring'] = df.get('docstring', "").fillna("")
41
+ df['summary'] = df.get('summary', "").fillna("")
42
+
43
  # Generate embeddings for each row
44
  def compute_embedding(row):
45
+ text = f"{row['docstring']} {row['summary']}"
46
+ return generate_embedding(text, _tokenizer, _model, _device)
47
 
48
  df['embedding'] = df.apply(compute_embedding, axis=1)
49
  return df
 
52
  """Fetch README file from GitHub repository."""
53
  try:
54
  readme_url = repo_url.rstrip("/") + "/blob/main/README.md"
55
+ response = requests.get(readme_url, timeout=10)
56
  if response.status_code == 200:
57
  return response.text
58
  else:
59
  return "README not available."
60
+ except requests.exceptions.RequestException as e:
61
  return f"Error fetching README: {e}"
62
 
63
  # Main application logic
 
67
 
68
  # Load resources
69
  tokenizer, model, device = load_model()
70
+
71
+ with st.spinner("Loading dataset and generating embeddings. This may take a moment..."):
72
+ try:
73
+ data = load_data(tokenizer, model, device)
74
+ except Exception as e:
75
+ st.error(f"Error loading dataset: {e}")
76
+ return
77
 
78
  # Input user query
79
  user_query = st.text_input("Describe your project or learning goal:",
80
  "I am working on a project to recommend music using pandas and numpy.")
81
  if user_query:
82
+ with st.spinner("Processing your query..."):
83
+ query_embedding = generate_embedding(user_query, tokenizer, model, device)
84
+
85
+ # Compute similarity
86
+ try:
87
+ data['similarity'] = data['embedding'].apply(
88
+ lambda emb: cosine_similarity([query_embedding], [np.array(emb)])[0][0]
89
+ )
90
+
91
+ # Filter and sort recommendations
92
+ top_recommendations = (
93
+ data.sort_values(by='similarity', ascending=False)
94
+ .head(5)
95
+ )
96
+
97
+ # Display recommendations
98
+ st.subheader("Top Recommendations")
99
+ for idx, row in top_recommendations.iterrows():
100
+ st.markdown(f"### {row['repo']}")
101
+ st.write(f"**Path:** {row['path']}")
102
+ st.write(f"**Summary:** {row['summary']}")
103
+ st.write(f"**Similarity Score:** {row['similarity']:.2f}")
104
+ st.markdown(f"[Repository Link]({row['url']})")
105
+
106
+ # Fetch and display README
107
+ st.subheader("Repository README")
108
+ readme_content = fetch_readme(row['url'])
109
+ st.code(readme_content)
110
+
111
+ except Exception as e:
112
+ st.error(f"Error computing recommendations: {e}")
113
 
114
  if __name__ == "__main__":
115
  main()