Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import logging
|
4 |
+
import gradio as gr
|
5 |
+
from pinecone import Pinecone, ServerlessSpec
|
6 |
+
from langchain_pinecone import PineconeVectorStore
|
7 |
+
from langchain_community.document_loaders import TextLoader
|
8 |
+
from langchain.text_splitter import CharacterTextSplitter
|
9 |
+
from langchain.chains import RetrievalQA
|
10 |
+
from langchain_community.llms import OpenAI
|
11 |
+
from langchain_openai import OpenAIEmbeddings
|
12 |
+
|
# --- Logging ---
# Send log records to stdout so they show up in the hosting container's logs.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# --- Environment Variables ---
def _require_env(var_name):
    """Return the value of environment variable *var_name*, raising ValueError when unset or empty."""
    value = os.getenv(var_name)
    if not value:
        raise ValueError(f"Please set the {var_name} as an environment variable.")
    return value

# Fail fast at startup with a clear message when a key is missing.
api_key = _require_env("PINECONE_API_KEY")
openai_api_key = _require_env("OPENAI_API_KEY")
# LangChain's OpenAI integrations read the key from the process environment.
os.environ["OPENAI_API_KEY"] = openai_api_key
# --- Pinecone Setup ---
index_name = "quickstart"
dimension = 1536  # must match the embedding model's output size
pc = Pinecone(api_key=api_key)

# Create the serverless index on first run; later runs reuse the existing one.
# NOTE(review): euclidean metric kept as-is; cosine is the usual choice for
# OpenAI embeddings — confirm before changing, since it alters result ranking.
existing_names = {idx["name"] for idx in pc.list_indexes()}
if index_name not in existing_names:
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
# --- Load and Process Document ---
# Download Paul Graham's essay on first run, cache it locally, then split it
# into chunks small enough to embed one at a time.
os.makedirs("data/paul_graham", exist_ok=True)
file_path = "data/paul_graham/paul_graham_essay.txt"
if not os.path.exists(file_path):
    import requests
    url = "https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt"
    # Bound the request so startup cannot hang forever, and fail loudly on a
    # bad response instead of silently caching an HTML error page as the essay.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Write UTF-8 explicitly so the cached file does not depend on the
    # platform's default encoding.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(r.text)

loader = TextLoader(file_path)
documents = loader.load()
# chunk_overlap=0 keeps chunks disjoint; 1000 characters stays well under the
# embedding model's token limit.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# --- Embedding and Vector Store ---
# Embed every chunk with OpenAI and upsert the vectors into the Pinecone index.
embeddings = OpenAIEmbeddings()
docsearch = PineconeVectorStore.from_documents(
    texts, embeddings, index_name=index_name
)

# --- Query Engine Setup ---
# "stuff" chain type: all retrieved chunks are concatenated into one prompt.
retriever = docsearch.as_retriever()
llm = OpenAI()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
# --- Query Function ---
def ask_question(prompt):
    """Run *prompt* through the RetrievalQA chain and return the answer text.

    Any failure (API errors, rate limits, missing setup, ...) is caught and
    rendered as a user-facing error string so the Gradio UI never crashes.
    """
    try:
        answer = qa.run(prompt)
    except Exception as exc:
        return f"❌ Error: {str(exc)}"
    return str(answer)
# --- Gradio UI ---
# Warm, parchment-style theme applied to the whole app.
_CSS = """body { background-color: #f5f5dc; font-family: 'Georgia', 'Merriweather', serif;}h1, h2, h3 { color: #4e342e;}.gr-box, .gr-column, .gr-group { border-radius: 15px; padding: 20px; background-color: #fffaf0; box-shadow: 2px 4px 14px rgba(0, 0, 0, 0.1); margin-top: 10px;}textarea, input[type="text"] { background-color: #fffaf0; border: 1px solid #d2b48c; color: #4e342e; border-radius: 8px;}button { background-color: #a1887f; color: white; font-weight: bold; border-radius: 8px; transition: background-color 0.3s ease;}button:hover { background-color: #8d6e63;}.gr-button { border-radius: 8px !important;}"""


def _clear():
    """Reset both the question and answer boxes to empty strings."""
    return "", ""


with gr.Blocks(css=_CSS) as demo:
    with gr.Column():
        # Centered page header.
        gr.Markdown("""
<div style='text-align: center;'>
<h1>🧠 Paul Graham Essay Q&A</h1>
<div style='font-size: 1.1em; color: #6d4c41; margin-bottom: 1em;'>
Explore insights from Paul Graham's essay using semantic search powered by <strong>LangChain</strong> + <strong>Pinecone</strong>.
</div>
</div>
""")
        # Collapsible explainer for users unfamiliar with vector search.
        with gr.Accordion("ℹ️ What is Pinecone Vector Indexing?", open=False):
            gr.Markdown("""**Pinecone** is a vector database that stores document embeddings (numeric representations of meaning). When you ask a question, it's converted into a vector and compared against stored vectors to find the most relevant answers — even if they don't match word-for-word.""")
        gr.Markdown("### 📖 Ask your question below:")
        with gr.Group():
            with gr.Row():
                user_input = gr.Textbox(
                    placeholder="E.g., What does Paul Graham say about startups?",
                    label="Your Question",
                    lines=2,
                )
            with gr.Row():
                output = gr.Textbox(label="Answer", lines=6)
            with gr.Row():
                submit_btn = gr.Button("🔍 Search Essay")
                clear_btn = gr.Button("🧹 Clear")
        # Wire events: search runs the QA chain; clear resets both textboxes.
        submit_btn.click(fn=ask_question, inputs=user_input, outputs=output)
        clear_btn.click(fn=_clear, inputs=None, outputs=[user_input, output])

demo.launch()