Spaces:
Sleeping
Sleeping
| """ | |
| RAG引擎:实现传统RAG和GraphRAG的检索逻辑 | |
| """ | |
| from typing import List, Dict, Tuple | |
| # 优先使用轻量级版本(避免超过 Vercel 250MB 限制) | |
| try: | |
| from database_setup_lite import SimpleGraphDB, VectorDB | |
| except ImportError: | |
| from database_setup import SimpleGraphDB, VectorDB | |
| import json | |
| import requests | |
# LLM settings come from the process environment so that no credential is
# hard-coded in the source tree.
import os

LLM_API_BASE = os.environ.get("LLM_API_BASE", "https://api.ai-gaochao.cn/v1")
LLM_API_KEY = os.environ.get("LLM_API_KEY", "")
LLM_MODEL = os.environ.get("LLM_MODEL", "gemini-2.5-flash")

# Fail fast at import time: without a key every LLM request would be rejected.
if LLM_API_KEY == "":
    raise ValueError("LLM_API_KEY 环境变量未设置!请在 .env 文件中设置 LLM_API_KEY")
class TraditionalRAG:
    """Baseline semantic RAG: plain vector search that returns short snippets.

    The graph database handle is stored on the instance, but ``retrieve``
    never consults it; only the vector database is queried.
    """

    # At most this many snippets are returned, whatever ``n_results`` says.
    _MAX_RESULTS = 3
    # Snippets longer than this many characters are cut and suffixed "...".
    _MAX_SNIPPET_CHARS = 150
    # Snippets shorter than this are replaced by the leading sentences.
    _MIN_SNIPPET_CHARS = 30
    # Number of sentences combined into one snippet.
    _SENTENCES_PER_SNIPPET = 3

    def __init__(self, vector_db: "VectorDB", graph_db: "SimpleGraphDB" = None):
        self.vector_db = vector_db
        self.graph_db = graph_db  # kept for callers; unused by retrieve()

    @staticmethod
    def _split_sentences(text: str) -> List[str]:
        """Split *text* on sentence-ending punctuation and drop empty pieces."""
        import re  # kept function-local, as in the original code

        # Both the full-width (。！？) and the ASCII (.!?) terminators count.
        pieces = re.split(r"[。！？.!?]", text)
        return [piece.strip() for piece in pieces if piece.strip()]

    @classmethod
    def _truncate(cls, text: str) -> str:
        """Cut *text* to the snippet length limit, marking the cut with "..."."""
        if len(text) > cls._MAX_SNIPPET_CHARS:
            return text[:cls._MAX_SNIPPET_CHARS] + "..."
        return text

    @classmethod
    def _build_snippet(cls, content: str, keywords) -> str:
        """Return the snippet for *content*, or "" if it holds no sentence.

        Sentences containing the most query keywords are preferred; when no
        sentence matches, or the matched snippet is very short, the leading
        sentences of the document are used instead.
        """
        sentences = cls._split_sentences(content)
        if not sentences:
            return ""

        leading = sentences[:cls._SENTENCES_PER_SNIPPET]
        scored = [
            (sentence, sum(1 for kw in keywords if kw in sentence.lower()))
            for sentence in sentences
        ]
        matched = [pair for pair in scored if pair[1] > 0]
        # list.sort is stable, so ties keep their document order.
        matched.sort(key=lambda pair: pair[1], reverse=True)
        chosen = [s for s, _ in matched[:cls._SENTENCES_PER_SNIPPET]] or leading

        snippet = "。".join(chosen)
        if len(snippet) < cls._MIN_SNIPPET_CHARS:
            snippet = "。".join(leading)
        return cls._truncate(snippet)

    @staticmethod
    def _as_result(hit: Dict, snippet: str, full_content: str) -> Dict:
        """Build one result entry from a vector-search hit and its snippet."""
        return {
            "content": snippet,  # the snippet, not the whole document
            "full_content": full_content,  # whole document, kept for display
            "metadata": hit.get("metadata", {}),
            "distance": hit.get("distance", 0),
            "is_snippet": True,
        }

    def retrieve(self, query: str, product_name: str = None, style_name: str = None, n_results: int = 5) -> Dict:
        """Run a vector search for *query* and return up to three snippets.

        Args:
            query: Free-text query passed to the vector database.
            product_name: Echoed back in the result; not used for searching.
            style_name: Echoed back in the result; not used for searching.
            n_results: Requested number of results. The effective limit is
                ``min(3, n_results)``; zero or a negative value yields an
                empty result list without querying the vector database.

        Returns:
            A dict with the keys ``method``, ``query``, ``product``,
            ``style``, ``results``, ``retrieval_path`` and ``explanation``.
        """
        limit = max(0, min(self._MAX_RESULTS, n_results))

        # Over-fetch so hits that yield no snippet can be skipped.
        candidates = []
        if limit:
            candidates = self.vector_db.search(query, n_results=limit * 2)[:limit * 2]

        # NOTE: str.split() only splits on whitespace, so a query written
        # without spaces is treated as one single keyword.
        keywords = set(query.lower().split())

        snippets = []
        for hit in candidates:
            if len(snippets) >= limit:
                break
            content = hit.get("content", "")
            if not content:
                continue
            snippet = self._build_snippet(content, keywords)
            if snippet:
                snippets.append(self._as_result(hit, snippet, content))

        # Nothing could be split into sentences: fall back to the raw head of
        # each document so that the caller still gets something to show.
        if not snippets:
            for hit in candidates[:limit]:
                content = hit.get("content", "")
                if content:
                    snippets.append(self._as_result(hit, self._truncate(content), content))

        snippets = snippets[:limit]
        return {
            "method": "语义检索",
            "query": query,
            "product": product_name,
            "style": style_name,
            "results": snippets,
            "retrieval_path": [
                "向量相似度搜索(传统RAG:不利用图结构)",
                f"找到 {len(snippets)} 个语义相似的片段",
                "⚠️ 局限性:只返回片段句子,没有图结构,无法找到跨品类的风格相关文案"
            ],
            "explanation": "传统RAG直接通过语义相似度搜索相关文案,使用相同的文案数据库,但只返回与查询最相关的片段句子(而不是完整文案)。没有图结构,无法找到跨品类的风格相关文案。"
        }
class GraphRAG:
    """Graph-augmented RAG: style-node traversal topped up by vector search."""

    def __init__(self, graph_db: "SimpleGraphDB", vector_db: "VectorDB"):
        self.graph_db = graph_db
        self.vector_db = vector_db

    def _copies_for_style(self, style_node: Dict, style_name: str) -> List[Dict]:
        """Collect every Copywriting node linked to *style_node* by HAS_STYLE.

        Each returned document also names the product that owns the copy,
        looked up through the ``Product -HAS_COPY-> Copywriting`` edges.
        """
        edges = self.graph_db.edges
        nodes = self.graph_db.nodes

        # One pass over the edges instead of rescanning them for every copy.
        # setdefault keeps the FIRST HAS_COPY edge per copy, which is what a
        # linear scan that stops at the first match would return.
        owner_of = {}
        for edge in edges:
            if edge.get("relationship") == "HAS_COPY":
                owner_of.setdefault(edge["target"], edge["source"])

        documents = []
        for edge in edges:
            if edge["target"] != style_node["id"] or edge["relationship"] != "HAS_STYLE":
                continue
            copy_node = nodes.get(edge["source"])
            if not copy_node or copy_node["type"] != "Copywriting":
                continue

            # A copy without an owning product resolves to an empty dict.
            product_info = nodes.get(owner_of.get(edge["source"]), {}).get("properties", {})
            product_label = product_info.get("name", "未知")
            documents.append({
                "content": copy_node["properties"]["content"],
                "source": "图遍历",
                "product": product_label,
                "style": style_name,
                "tag": copy_node["properties"].get("tag", ""),
                "retrieval_reason": f"通过风格节点'{style_name}'找到的跨品类文案(来自产品:{product_label})"
            })
        return documents

    def retrieve(self, query: str, product_name: str = None, style_name: str = None, n_results: int = 5) -> Dict:
        """Retrieve reference copy through the graph, then through vectors.

        Args:
            query: Free-text query, used only for the vector top-up.
            product_name: Optional product whose ``features`` and ``keywords``
                properties are returned as ``product_features``.
            style_name: Optional style whose node is the traversal entry point.
            n_results: Maximum number of documents to return.

        Returns:
            A dict with the keys ``method``, ``query``, ``product``,
            ``style``, ``product_features``, ``results``, ``retrieval_path``
            and ``explanation``.
        """
        retrieval_path = []
        retrieved_docs = []

        # Steps 1-2: locate the style node and walk back to its copywriting.
        style_node = None
        if style_name:
            style_node = self.graph_db.find_node_by_property("Style", "name", style_name)
        if style_node:
            retrieval_path.append(f"定位风格节点: {style_node['properties']['name']}")
            retrieved_docs = self._copies_for_style(style_node, style_name)
            if retrieved_docs:
                retrieval_path.append(f"通过风格节点遍历找到 {len(retrieved_docs)} 个相关文案")
            else:
                retrieval_path.append("未找到该风格的相关文案")

        # Step 3: read the product's descriptive properties, if it is known.
        product_features = []
        if product_name:
            product_node = self.graph_db.find_node_by_property("Product", "name", product_name)
            if product_node:
                retrieval_path.append(f"定位产品节点: {product_name}")
                properties = product_node["properties"]
                product_features = properties.get("features", []) + properties.get("keywords", [])
                retrieval_path.append(f"提取产品特征: {', '.join(product_features[:5])}")

        # Step 4: top up with vector search when the graph gave too little.
        shortfall = n_results - len(retrieved_docs)
        if shortfall > 0:
            seen_contents = {doc["content"] for doc in retrieved_docs}
            added = 0
            for hit in self.vector_db.search(query, n_results=shortfall):
                content = hit["content"]
                if content in seen_contents:
                    continue  # already present from the graph or an earlier hit
                seen_contents.add(content)
                metadata = hit.get("metadata", {})
                retrieved_docs.append({
                    "content": content,
                    "source": "向量检索补充",
                    "product": metadata.get("product_id", "未知"),
                    "style": metadata.get("style_id", "未知"),
                    "tag": metadata.get("tag", ""),
                    "retrieval_reason": "语义相似度补充检索"
                })
                added += 1
            # Report what was really added, not the raw number of hits.
            if added:
                retrieval_path.append(f"向量检索补充 {added} 个结果")

        return {
            "method": "图增强检索",
            "query": query,
            "product": product_name,
            "style": style_name,
            "product_features": product_features,
            "results": retrieved_docs[:n_results],
            "retrieval_path": retrieval_path,
            "explanation": "通过图结构找到跨品类的风格相关文案,即使产品不同,但风格相通,可以借鉴文案模板。"
        }
| class RAGEngine: | |
| """RAG引擎主类""" | |
| def __init__(self, graph_db: SimpleGraphDB, vector_db: VectorDB): | |
| self.graph_db = graph_db | |
| self.traditional_rag = TraditionalRAG(vector_db, graph_db) | |
| self.graph_rag = GraphRAG(graph_db, vector_db) | |
| def compare_retrieval(self, query: str, product_name: str = None, style_name: str = None) -> Dict: | |
| """对比传统RAG和GraphRAG的检索结果""" | |
| traditional_result = self.traditional_rag.retrieve(query, product_name, style_name) | |
| graph_result = self.graph_rag.retrieve(query, product_name, style_name) | |
| return { | |
| "traditional_rag": traditional_result, | |
| "graph_rag": graph_result, | |
| "comparison": { | |
| "traditional_count": len(traditional_result["results"]), | |
| "graph_count": len(graph_result["results"]), | |
| "graph_cross_category": len([r for r in graph_result["results"] if r.get("source") == "图遍历"]) | |
| } | |
| } | |
| def generate_copywriting(self, query: str, product_name: str, style_name: str, use_graph: bool = True) -> Dict: | |
| """生成文案(使用LLM)""" | |
| if use_graph: | |
| retrieval_result = self.graph_rag.retrieve(query, product_name, style_name) | |
| else: | |
| retrieval_result = self.traditional_rag.retrieve(query, product_name, style_name) | |
| # 获取检索到的参考文案 | |
| retrieved_texts = [r["content"] for r in retrieval_result["results"][:5]] # 取前5个作为参考 | |
| # 统计信息 | |
| cross_category_count = len([r for r in retrieval_result["results"] if r.get("source") == "图遍历"]) if use_graph else 0 | |
| # 获取产品特征(用于GraphRAG) | |
| product_features = [] | |
| if use_graph and retrieval_result.get("product_features"): | |
| product_features = retrieval_result["product_features"] | |
| # 调用LLM生成文案 | |
| try: | |
| llm_generated = self._call_llm_generate( | |
| product_name=product_name, | |
| style_name=style_name, | |
| reference_texts=retrieved_texts, | |
| product_features=product_features, | |
| use_graph=use_graph, | |
| cross_category_count=cross_category_count | |
| ) | |
| except Exception as e: | |
| print(f"LLM生成失败: {e}") | |
| # 如果LLM失败,使用模板生成 | |
| llm_generated = self._generate_template(retrieved_texts, product_name, style_name) | |
| # 组装最终输出 | |
| if use_graph and product_features: | |
| features = ", ".join(product_features[:3]) | |
| reference_sources = ', '.join([r.get('product', '未知') for r in retrieval_result["results"][:3]]) | |
| generated_text = f"""基于图增强检索生成的文案: | |
| ✨ 检索策略:通过图结构找到跨品类的风格相关文案 | |
| 📊 检索结果:找到 {len(retrieved_texts)} 个相关文案,其中 {cross_category_count} 个来自跨品类(通过风格节点关联) | |
| 🎯 产品特征:{features} | |
| 📝 参考文案来源:{reference_sources} | |
| 【{style_name}风格】{product_name}文案: | |
| {llm_generated} | |
| 💡 说明:GraphRAG 通过风格节点找到了跨品类的参考文案(如香薰蜡烛的清冷避世风文案),即使产品不同,但风格相通,可以借鉴文案模板。""" | |
| else: | |
| generated_text = f"""基于传统语义检索生成的文案: | |
| 🔍 检索策略:直接通过语义相似度搜索 | |
| 📊 检索结果:找到 {len(retrieved_texts)} 个语义相似的文案 | |
| ⚠️ 局限性:如果数据库中没有相似内容,可能返回不相关的结果 | |
| 【{style_name}风格】{product_name}文案: | |
| {llm_generated} | |
| 💡 说明:传统 RAG 只能找到语义相似的文案,如果数据库中没有该产品的该风格文案,可能无法生成合适的文案。""" | |
| return { | |
| "generated_text": generated_text, | |
| "retrieval_result": retrieval_result, | |
| "method": "GraphRAG" if use_graph else "Traditional RAG" | |
| } | |
| def _call_llm_generate(self, product_name: str, style_name: str, reference_texts: List[str], | |
| product_features: List[str] = None, use_graph: bool = True, | |
| cross_category_count: int = 0) -> str: | |
| """调用LLM生成文案""" | |
| headers = { | |
| "Content-Type": "application/json", | |
| "Authorization": f"Bearer {LLM_API_KEY}" | |
| } | |
| url = f"{LLM_API_BASE}/chat/completions" | |
| # 构建参考文案说明 | |
| reference_context = "" | |
| if reference_texts: | |
| reference_context = "\n\n参考文案(用于学习风格和句式):\n" | |
| for i, text in enumerate(reference_texts[:3], 1): | |
| reference_context += f"{i}. {text}\n" | |
| else: | |
| reference_context = "\n\n⚠️ 注意:没有找到相关参考文案,请根据产品特征和风格要求创作。" | |
| # 构建产品特征说明 | |
| features_context = "" | |
| if product_features: | |
| features_context = f"\n产品特征:{', '.join(product_features[:5])}" | |
| # 构建prompt | |
| if use_graph and cross_category_count > 0: | |
| prompt = f"""你是一名擅长小红书文案写作的创意编辑。请根据以下信息,生成一篇适合在小红书发布的文案(200-300字,要求内容丰富、有细节感)。 | |
| 产品名称:{product_name} | |
| 目标风格:{style_name} | |
| {features_context} | |
| {reference_context} | |
| 重要提示: | |
| 1. 这些参考文案来自其他产品(跨品类),但风格相同,请学习它们的句式、语气和情感表达方式 | |
| 2. 将参考文案的风格和句式应用到目标产品上 | |
| 3. 文案要有细节感、人情味,符合小红书用户的阅读习惯 | |
| 4. 保持{style_name}的风格特征 | |
| 5. 文案长度要求200-300字,要有丰富的内容和细节描述,可以包含使用场景、情感体验、产品特色等多个方面 | |
| 6. 请确保文案完整,不要被截断,以完整的句子结尾 | |
| 请直接输出文案内容,不要包含"好的"、"没问题"等前缀,也不要使用markdown格式。只输出文案正文,确保内容完整。""" | |
| else: | |
| prompt = f"""你是一名擅长小红书文案写作的创意编辑。请根据以下信息,生成一篇适合在小红书发布的文案(200-300字,要求内容丰富、有细节感)。 | |
| 产品名称:{product_name} | |
| 目标风格:{style_name} | |
| {features_context} | |
| {reference_context} | |
| 重要提示: | |
| 1. 参考文案可能有限或不够相关,请根据产品特征和风格要求创作 | |
| 2. 文案要有细节感、人情味,符合小红书用户的阅读习惯 | |
| 3. 保持{style_name}的风格特征 | |
| 4. 文案长度要求200-300字,要有丰富的内容和细节描述,可以包含使用场景、情感体验、产品特色等多个方面 | |
| 5. 请确保文案完整,不要被截断,以完整的句子结尾 | |
| 请直接输出文案内容,不要包含"好的"、"没问题"等前缀,也不要使用markdown格式。只输出文案正文,确保内容完整。""" | |
| body = { | |
| "model": LLM_MODEL, | |
| "messages": [ | |
| { | |
| "role": "system", | |
| "content": "你是一名擅长文案写作的创意编辑,擅长创作小红书风格的文案。" | |
| }, | |
| { | |
| "role": "user", | |
| "content": prompt | |
| } | |
| ], | |
| "max_tokens": 4000, # 增加token限制以支持更长的文案(200-300字约需要800-1200 tokens,设置4000确保完整输出) | |
| "temperature": 0.9 | |
| } | |
| resp = requests.post(url, headers=headers, json=body, timeout=60) | |
| resp.raise_for_status() | |
| data = resp.json() | |
| generated = data["choices"][0]["message"]["content"].strip() | |
| # 清理生成的内容 | |
| # 移除常见的前缀(只移除开头的前缀,不要截断内容) | |
| prefixes_to_remove = [ | |
| "好的,没问题!", | |
| "好的,", | |
| "没问题!", | |
| "好的!", | |
| ] | |
| for prefix in prefixes_to_remove: | |
| if generated.startswith(prefix): | |
| generated = generated[len(prefix):].strip() | |
| # 移除markdown格式符号(但保留内容) | |
| generated = generated.replace("**", "").replace("*", "").strip() | |
| return generated | |
| def _generate_template(self, reference_texts: List[str], product_name: str, style_name: str) -> str: | |
| """生成文案模板(简化版,实际应调用LLM)""" | |
| # 如果有参考文案,提取关键句式 | |
| key_phrases = [] | |
| if reference_texts: | |
| for text in reference_texts[:2]: # 只取前2个参考 | |
| # 提取关键句式(简单提取) | |
| if "避难所" in text: | |
| key_phrases.append("避难所") | |
| if "安静" in text: | |
| key_phrases.append("安静") | |
| if "唯一" in text: | |
| key_phrases.append("唯一") | |
| if "绝绝子" in text: | |
| key_phrases.append("绝绝子") | |
| # 根据风格和产品生成 | |
| if "清冷避世风" in style_name or "深夜emo" in style_name.lower(): | |
| if "眼罩" in product_name: | |
| if key_phrases: | |
| # GraphRAG:使用参考文案的句式 | |
| return f"戴上眼罩的这片刻漆黑,是我在繁杂城市里唯一的{'避难所' if '避难所' in key_phrases else '避风港'}。物理意义上的关灯,也是心理上的断联。世界终于{'安静了' if '安静' in key_phrases else '静下来了'},今晚只属于我自己。" | |
| else: | |
| # 传统RAG:没有参考,使用通用模板 | |
| return f"这个{product_name}真的很不错,遮光效果好,推荐给大家使用。" | |
| elif "CCD" in product_name or "相机" in product_name: | |
| return "深夜拿起它,在颗粒感的画面里,所有的情绪都有了出口。低像素不是缺陷,是另一种真实。" | |
| else: | |
| if key_phrases: | |
| return f"每一个与{product_name}的瞬间,都是我与世界的{'唯一连接' if '唯一' in key_phrases else '连接'}。" | |
| else: | |
| return f"这个{product_name}真的很不错,推荐给大家。" | |
| elif "疯狂种草" in style_name: | |
| if key_phrases and "绝绝子" in key_phrases: | |
| # GraphRAG:使用参考文案的语气 | |
| return f"家人们谁懂啊!这个{product_name}真的绝绝子,一秒沦陷!必须人手一个!" | |
| else: | |
| # 传统RAG:没有参考,使用通用语气 | |
| return f"这个{product_name}真的很不错,推荐给大家购买!" | |
| else: | |
| if key_phrases: | |
| return f"这个{product_name}真的很不错,{'强烈推荐' if '绝绝子' in key_phrases else '推荐'}给大家!" | |
| else: | |
| return f"这个{product_name}真的很不错,推荐给大家!" | |