3v324v23 committed on
Commit
27f04d2
·
1 Parent(s): 11a81be

Deploy: pre-built index

Browse files
Files changed (3) hide show
  1. README.md +5 -6
  2. app.py +335 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
- title: SloganAI NewAsaf
3
- emoji: 👀
4
- colorFrom: indigo
5
- colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.43.1
8
  app_file: app.py
9
  pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: SloganAI-Fixed
3
+ emoji: 🏷️
4
+ colorFrom: yellow
5
+ colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.43.1
8
  app_file: app.py
9
  pinned: false
10
  ---
11
+ # SloganAI - Fixed
 
app.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os, re
3
+ import numpy as np
4
+ import pandas as pd
5
+ import gradio as gr
6
+ import faiss
7
+ import torch
8
+ from typing import List
9
+ from sentence_transformers import SentenceTransformer, CrossEncoder
10
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
11
+
12
+ # ---- Config ----
13
+ FLAN_PRIMARY = os.getenv("FLAN_PRIMARY", "google/flan-t5-base")
14
+ EMBED_NAME = "sentence-transformers/all-mpnet-base-v2"
15
+ RERANK_NAME = "cross-encoder/stsb-roberta-base"
16
+ NUM_SLOGAN_SAMPLES = int(os.getenv("NUM_SLOGAN_SAMPLES", "16"))
17
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
+ ASSETS_DIR = "assets"
19
+
20
+ # ---- Lazy models ----
21
+ _GEN_TOK = None
22
+ _GEN_MODEL = None
23
+ _EMBED_MODEL = None
24
+ _RERANKER = None
25
+
26
+ def _ensure_models():
27
+ global _GEN_TOK, _GEN_MODEL, _EMBED_MODEL, _RERANKER
28
+ if _EMBED_MODEL is None:
29
+ _EMBED_MODEL = SentenceTransformer(EMBED_NAME)
30
+ if _RERANKER is None:
31
+ _RERANKER = CrossEncoder(RERANK_NAME)
32
+ if _GEN_MODEL is None:
33
+ tok = AutoTokenizer.from_pretrained(FLAN_PRIMARY)
34
+ mdl = AutoModelForSeq2SeqLM.from_pretrained(FLAN_PRIMARY)
35
+ _GEN_TOK, _GEN_MODEL = tok, mdl.to(DEVICE)
36
+ print(f"[INFO] Loaded generator: {FLAN_PRIMARY}")
37
+
38
+ # ---- Data & PRE-BUILT FAISS from assets folder ----
39
+ _DATA_DF = None
40
+ _INDEX = None
41
+ _EMBEDDINGS = None
42
+
43
+ def _ensure_index():
44
+ global _DATA_DF, _INDEX, _EMBEDDINGS
45
+ if _INDEX is not None:
46
+ return
47
+
48
+ # Load assets from the assets directory
49
+ try:
50
+ data_path = os.path.join(ASSETS_DIR, "data.parquet")
51
+ index_path = os.path.join(ASSETS_DIR, "faiss.index")
52
+ emb_path = os.path.join(ASSETS_DIR, "embeddings.npy")
53
+
54
+ _DATA_DF = pd.read_parquet(data_path)
55
+ _INDEX = faiss.read_index(index_path)
56
+ _EMBEDDINGS = np.load(emb_path)
57
+
58
+ print(f"[INFO] Loaded pre-built FAISS index. rows={len(_DATA_DF)}, dim={_INDEX.d}")
59
+
60
+ except FileNotFoundError:
61
+ print("[ERROR] Pre-built assets not found. The space may fail to run.")
62
+ print("[INFO] Falling back to building a tiny demo index.")
63
+ _DATA_DF = pd.DataFrame({
64
+ "name": ["HowDidIDo", "Museotainment", "Movitr"],
65
+ "tagline": ["Online evaluation platform", "PacMan & Louvre meet", "Crowdsourced video translation"],
66
+ "description": [
67
+ "Public speaking, Presentation skills and interview practice",
68
+ "Interactive AR museum tours",
69
+ "Video translation with voice and subtitles"
70
+ ]
71
+ })
72
+ _ensure_models()
73
+ vecs = _EMBED_MODEL.encode(_DATA_DF["description"].astype(str).tolist(), normalize_embeddings=True).astype(np.float32)
74
+ _INDEX = faiss.IndexFlatIP(vecs.shape[1])
75
+ _INDEX.add(vecs)
76
+
77
+ def recommend(query_text: str, top_k: int = 3) -> pd.DataFrame:
78
+ _ensure_index()
79
+ _ensure_models() # Make sure the embedder is ready
80
+ q_vec = _EMBED_MODEL.encode([query_text], normalize_embeddings=True).astype("float32")
81
+ scores, idxs = _INDEX.search(q_vec, top_k)
82
+ out = _DATA_DF.iloc[idxs[0]].copy()
83
+ out["score"] = scores[0]
84
+ return out[["name","tagline","description","score"]]
85
+
86
+ # ---- Refined v2 slogan generator (unchanged logic) ----
87
+ BLOCK_PATTERNS = [
88
+ r"^[A-Z][a-z]+ [A-Z][a-z]+ (Platform|Solution|System|Application|Marketplace)$",
89
+ r"^[A-Z][a-z]+ [A-Z][a-z]+$",
90
+ r"^[A-Z][a-z]+$",
91
+ ]
92
+ HARD_BLOCK_WORDS = {
93
+ "platform","solution","system","application","marketplace",
94
+ "ai-powered","ai powered","empower","empowering",
95
+ "artificial intelligence","machine learning","augmented reality","virtual reality",
96
+ }
97
+ GENERIC_WORDS = {"app","assistant","smart","ai","ml","ar","vr","decentralized","blockchain"}
98
+ MARKETING_VERBS = {"build","grow","simplify","discover","create","connect","transform","unlock","boost","learn","move","clarify"}
99
+ BENEFIT_WORDS = {"faster","smarter","easier","better","safer","clearer","stronger","together","confidently","simply","instantly"}
100
+ GOOD_SLOGANS_TO_AVOID_DUP = {
101
+ "smarter care, faster decisions",
102
+ "checkout built for small brands",
103
+ "less guessing. more healing.",
104
+ "built to grow with your cart.",
105
+ "stand tall. feel better.",
106
+ "train your brain to win.",
107
+ "your body. your algorithm.",
108
+ "play smarter. grow brighter.",
109
+ "style that thinks with you."
110
+ }
111
+
112
+ def _tokens(s: str) -> List[str]:
113
+ return re.findall(r"[a-z0-9]{3,}", s.lower())
114
+
115
+ def _jaccard(a: List[str], b: List[str]) -> float:
116
+ A, B = set(a), set(b)
117
+ return 0.0 if not A or not B else len(A & B) / len(A | B)
118
+
119
+ def _titlecase_soft(s: str) -> str:
120
+ out = []
121
+ for w in s.split():
122
+ out.append(w if w.isupper() else w.capitalize())
123
+ return " ".join(out)
124
+
125
+ def _is_blocked_slogan(s: str) -> bool:
126
+ if not s: return True
127
+ s_strip = s.strip()
128
+ for pat in BLOCK_PATTERNS:
129
+ if re.match(pat, s_strip):
130
+ return True
131
+ s_low = s_strip.lower()
132
+ for w in HARD_BLOCK_WORDS:
133
+ if w in s_low:
134
+ return True
135
+ if s_low in GOOD_SLOGANS_TO_AVOID_DUP:
136
+ return True
137
+ return False
138
+
139
+ def _generic_penalty(s: str) -> float:
140
+ hits = sum(1 for w in GENERIC_WORDS if w in s.lower())
141
+ return min(1.0, 0.25 * hits)
142
+
143
+ def _for_penalty(s: str) -> float:
144
+ return 0.3 if re.search(r"\bfor\b", s.lower()) else 0.0
145
+
146
+ def _neighbor_context(neighbors_df: pd.DataFrame) -> str:
147
+ if neighbors_df is None or neighbors_df.empty:
148
+ return ""
149
+ examples = []
150
+ for _, row in neighbors_df.head(3).iterrows():
151
+ tg = str(row.get("tagline", "")).strip()
152
+ if 5 <= len(tg) <= 70:
153
+ examples.append(f"- {tg}")
154
+ return "\n".join(examples)
155
+
156
+ def _copies_neighbor(s: str, neighbors_df: pd.DataFrame) -> bool:
157
+ if neighbors_df is None or neighbors_df.empty:
158
+ return False
159
+ s_low = s.lower()
160
+ s_toks = _tokens(s_low)
161
+ for _, row in neighbors_df.iterrows():
162
+ t = str(row.get("tagline", "")).strip()
163
+ if not t:
164
+ continue
165
+ t_low = t.lower()
166
+ if s_low == t_low:
167
+ return True
168
+ if _jaccard(s_toks, _tokens(t_low)) >= 0.7:
169
+ return True
170
+ try:
171
+ _ensure_models() # Make sure the embedder is ready
172
+ s_vec = _EMBED_MODEL.encode([s])[0]; s_vec = s_vec / np.linalg.norm(s_vec)
173
+ for _, row in neighbors_df.head(3).iterrows():
174
+ t = str(row.get("tagline", "")).strip()
175
+ if not t: continue
176
+ t_vec = _EMBED_MODEL.encode([t])[0]; t_vec = t_vec / np.linalg.norm(t_vec)
177
+ if float(np.dot(s_vec, t_vec)) >= 0.85:
178
+ return True
179
+ except Exception:
180
+ pass
181
+ return False
182
+
183
+ def _clean_slogan(text: str, max_words: int = 8) -> str:
184
+ text = text.strip().split("\n")[0]
185
+ text = re.sub(r"[\"β€œβ€β€˜β€™]", "", text)
186
+ text = re.sub(r"\s+", " ", text).strip()
187
+ text = re.sub(r"^\W+|\W+$", "", text)
188
+ words = text.split()
189
+ if len(words) > max_words:
190
+ text = " ".join(words[:max_words])
191
+ return text
192
+
193
+ def _score_candidates(query: str, cands: List[str], neighbors_df: pd.DataFrame) -> List[tuple]:
194
+ if not cands:
195
+ return []
196
+ _ensure_models() # Make sure the cross-encoder is ready
197
+ ce_scores = np.asarray(_RERANKER.predict([(query, s) for s in cands]), dtype=np.float32) / 5.0
198
+ q_toks = _tokens(query)
199
+ results = []
200
+
201
+ neighbor_vecs = []
202
+ if neighbors_df is not None and not neighbors_df.empty:
203
+ _ensure_models() # Make sure the embedder is ready
204
+ for _, row in neighbors_df.head(3).iterrows():
205
+ t = str(row.get("tagline","")).strip()
206
+ if t:
207
+ v = _EMBED_MODEL.encode([t])[0]
208
+ neighbor_vecs.append(v / np.linalg.norm(v))
209
+
210
+ for i, s in enumerate(cands):
211
+ words = s.split()
212
+ brevity = 1.0 - min(1.0, abs(len(words) - 5) / 5.0)
213
+ wl = set(w.lower() for w in words)
214
+ m_hits = len(wl & MARKETING_VERBS)
215
+ b_hits = len(wl & BENEFIT_WORDS)
216
+ marketing = min(1.0, 0.2*m_hits + 0.2*b_hits)
217
+ g_pen = _generic_penalty(s)
218
+ f_pen = _for_penalty(s)
219
+
220
+ n_pen = 0.0
221
+ if neighbor_vecs:
222
+ try:
223
+ _ensure_models() # Make sure the embedder is ready
224
+ s_vec = _EMBED_MODEL.encode([s])[0]; s_vec = s_vec / np.linalg.norm(s_vec)
225
+ sim_max = max(float(np.dot(s_vec, nv)) for nv in neighbor_vecs) if neighbor_vecs else 0.0
226
+ n_pen = sim_max
227
+ except Exception:
228
+ n_pen = 0.0
229
+
230
+ overlap = _jaccard(q_toks, _tokens(s))
231
+ anti_copy = 1.0 - overlap
232
+
233
+ score = (
234
+ 0.55*float(ce_scores[i]) +
235
+ 0.20*brevity +
236
+ 0.15*marketing +
237
+ 0.03*anti_copy -
238
+ 0.07*g_pen -
239
+ 0.03*f_pen -
240
+ 0.10*n_pen
241
+ )
242
+ results.append((s, float(score)))
243
+ return results
244
+
245
+ def generate_slogan(query_text: str, neighbors_df: pd.DataFrame = None, n_samples: int = NUM_SLOGAN_SAMPLES) -> str:
246
+ _ensure_models()
247
+ ctx = _neighbor_context(neighbors_df)
248
+ prompt = (
249
+ "You are a creative brand copywriter. Write short, original, memorable startup slogans (max 8 words).\n"
250
+ "Forbidden words: app, assistant, platform, solution, system, marketplace, AI, machine learning, augmented reality, virtual reality, decentralized, empower.\n"
251
+ "Focus on clear benefits and vivid verbs. Do not copy the description. Return ONLY a list, one slogan per line.\n\n"
252
+ "Good Examples:\n"
253
+ "Description: AI assistant for doctors to prioritize patient cases\n"
254
+ "Slogan: Less Guessing. More Healing.\n\n"
255
+ "Description: Payments for small online stores\n"
256
+ "Slogan: Built to Grow with Your Cart.\n\n"
257
+ "Description: Neurotech headset to boost focus\n"
258
+ "Slogan: Train Your Brain to Win.\n\n"
259
+ "Description: Interior design suggestions with AI\n"
260
+ "Slogan: Style That Thinks With You.\n\n"
261
+ "Bad Examples (avoid these): Innovative AI Platform / Smart App for Everyone / Empowering Small Businesses\n\n"
262
+ )
263
+ if ctx:
264
+ prompt += f"Similar taglines (style only):\n{ctx}\n\n"
265
+ prompt += f"Description: {query_text}\nSlogans:"
266
+
267
+ input_ids = _GEN_TOK(prompt, return_tensors="pt").input_ids.to(DEVICE)
268
+ outputs = _GEN_MODEL.generate(
269
+ input_ids,
270
+ max_new_tokens=24,
271
+ do_sample=True,
272
+ top_k=60,
273
+ top_p=0.92,
274
+ temperature=1.2,
275
+ num_return_sequences=n_samples,
276
+ repetition_penalty=1.08
277
+ )
278
+ raw_cands = [_GEN_TOK.decode(o, skip_special_tokens=True) for o in outputs]
279
+
280
+ cand_set = set()
281
+ for txt in raw_cands:
282
+ for line in txt.split("\n"):
283
+ s = _clean_slogan(line)
284
+ if not s:
285
+ continue
286
+ if len(s.split()) < 2 or len(s.split()) > 8:
287
+ continue
288
+ if _is_blocked_slogan(s):
289
+ continue
290
+ if _copies_neighbor(s, neighbors_df):
291
+ continue
292
+ cand_set.add(_titlecase_soft(s))
293
+
294
+ if not cand_set:
295
+ return _clean_slogan(_GEN_TOK.decode(outputs[0], skip_special_tokens=True))
296
+
297
+ scored = _score_candidates(query_text, sorted(cand_set), neighbors_df)
298
+ if not scored:
299
+ return _clean_slogan(_GEN_TOK.decode(outputs[0], skip_special_tokens=True))
300
+
301
+ scored.sort(key=lambda x: x[1], reverse=True)
302
+ return scored[0][0]
303
+
304
+ # ---- Gradio UI ----
305
+ EXAMPLES = [
306
+ "AI coach for improving public speaking skills",
307
+ "Augmented reality app for interactive museum tours",
308
+ "Voice-controlled task manager for remote teams",
309
+ "Machine learning system for predicting crop yields",
310
+ "Platform for AI-assisted interior design suggestions",
311
+ ]
312
+
313
+ def pipeline(user_input: str):
314
+ recs = recommend(user_input, top_k=3)
315
+ slogan = generate_slogan(user_input, neighbors_df=recs, n_samples=NUM_SLOGAN_SAMPLES)
316
+ recs = recs.reset_index(drop=True)
317
+ recs.loc[len(recs)] = {"name":"Synthetic Example","tagline":slogan,"description":user_input,"score":np.nan}
318
+ return recs[["name","tagline","description","score"]], slogan
319
+
320
+ with gr.Blocks(title="SloganAI β€” Recommendations + Slogan Generator") as demo:
321
+ gr.Markdown("## SloganAI β€” Top-3 Recommendations + A High-Quality Generated Slogan")
322
+ with gr.Row():
323
+ with gr.Column(scale=1):
324
+ inp = gr.Textbox(label="Enter a startup description", lines=3, placeholder="e.g., AI coach for improving public speaking skills")
325
+ gr.Examples(EXAMPLES, inputs=inp, label="One-click examples")
326
+ btn = gr.Button("Submit", variant="primary")
327
+ with gr.Column(scale=2):
328
+ out_df = gr.Dataframe(headers=["Name","Tagline","Description","Score"], label="Top 3 + Generated")
329
+ out_sg = gr.Textbox(label="Generated Slogan", interactive=False)
330
+ btn.click(fn=pipeline, inputs=inp, outputs=[out_df, out_sg])
331
+
332
+ if __name__ == "__main__":
333
+ _ensure_models()
334
+ _ensure_index()
335
+ demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ sentence-transformers
4
+ faiss-cpu
5
+ pandas
6
+ numpy
7
+ torch
8
+ pyarrow