Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ import langchain
|
|
8 |
import time
|
9 |
from cnocr import CnOcr
|
10 |
import pinecone
|
11 |
-
|
12 |
from langchain.vectorstores import Pinecone
|
13 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
14 |
from langchain.text_splitter import CharacterTextSplitter
|
@@ -39,45 +39,7 @@ all_max_len = 3000
|
|
39 |
|
40 |
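# Initialize the Pinecone client and open a handle to the "test" index (pinecone.Index does not create the index).
# NOTE(review): the API key below is hard-coded; load it from an environment variable and rotate it.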
|
41 |
pinecone.init(api_key="ffb1f594-0915-4ebf-835f-c1eaa62fdcdc",environment = "us-west4-gcp-free")
|
42 |
-
index = pinecone.Index(index_name="test")
|
43 |
-
|
44 |
-
|
45 |
-
def pine(data, chunk_words=350):
    """Split ``data`` into word chunks and store them in Pinecone.

    The text is split on single spaces and grouped into consecutive chunks of
    at most ``chunk_words`` words. The chunks are embedded with
    ``OpenAIEmbeddings`` and stored through ``Pinecone.from_texts`` in the
    ``test`` index under namespace ``a1``.

    Args:
        data: Text to index.
        chunk_words: Maximum number of space-separated words per chunk.

    Returns:
        Always the empty string.

    Raises:
        ValueError: If ``chunk_words`` is smaller than 1.
    """
    import os

    if chunk_words < 1:
        raise ValueError("chunk_words must be a positive integer")

    words = data.split(" ")
    # Fixed strides guarantee every word lands in exactly one chunk, including
    # a short final chunk and texts shorter than a single chunk.
    doc_spilt = [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
    # Whitespace-only chunks carry nothing worth embedding.
    doc_spilt = [chunk for chunk in doc_spilt if chunk.strip()]
    if not doc_spilt:
        return ''

    # NOTE(review): these fallback credentials are committed to the repository.
    # Rotate them, provide the values via the environment, then drop the
    # literals.
    openai_api_key = os.environ.get(
        "OPENAI_API_KEY",
        "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",
    )
    pinecone_api_key = os.environ.get(
        "PINECONE_API_KEY",
        "ffb1f594-0915-4ebf-835f-c1eaa62fdcdc",
    )
    pinecone_environment = os.environ.get(
        "PINECONE_ENVIRONMENT",
        "us-west4-gcp-free",
    )

    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)
    Pinecone.from_texts(doc_spilt, embeddings, index_name="test", namespace='a1')

    return ''
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
|
82 |
|
83 |
def get_emb(text):
|
@@ -193,7 +155,7 @@ def up_file(fls):
|
|
193 |
|
194 |
#Pdf Extracting
|
195 |
for idx, file in enumerate(pdf):
|
196 |
-
print("
|
197 |
#print(file.name)
|
198 |
with pdfplumber.open(file) as pdf:
|
199 |
for i in range(len(pdf.pages)):
|
@@ -270,6 +232,39 @@ def up_file(fls):
|
|
270 |
value="Processing")
|
271 |
|
272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
|
274 |
|
275 |
|
|
|
8 |
import time
|
9 |
from cnocr import CnOcr
|
10 |
import pinecone
|
11 |
+
import openai
|
12 |
from langchain.vectorstores import Pinecone
|
13 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
14 |
from langchain.text_splitter import CharacterTextSplitter
|
|
|
39 |
|
40 |
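# Initialize the Pinecone client and open a handle to the "test" index (pinecone.Index does not create the index).
# NOTE(review): the API key below is hard-coded; load it from an environment variable and rotate it.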
|
41 |
pinecone.init(api_key="ffb1f594-0915-4ebf-835f-c1eaa62fdcdc",environment = "us-west4-gcp-free")
|
42 |
+
index = pinecone.Index(index_name="test")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
|
45 |
def get_emb(text):
|
|
|
155 |
|
156 |
#Pdf Extracting
|
157 |
for idx, file in enumerate(pdf):
|
158 |
+
print("11111")
|
159 |
#print(file.name)
|
160 |
with pdfplumber.open(file) as pdf:
|
161 |
for i in range(len(pdf.pages)):
|
|
|
232 |
value="Processing")
|
233 |
|
234 |
|
235 |
+
def pine(data, chunk_words=350):
    """Split ``data`` into word chunks and store them in Pinecone.

    The text is split on single spaces and grouped into consecutive chunks of
    at most ``chunk_words`` words. The chunks are embedded with
    ``OpenAIEmbeddings`` and stored through ``Pinecone.from_texts`` in the
    ``test`` index under namespace ``a1``.

    Args:
        data: Text to index.
        chunk_words: Maximum number of space-separated words per chunk.

    Returns:
        Always the empty string.

    Raises:
        ValueError: If ``chunk_words`` is smaller than 1.
    """
    import os

    if chunk_words < 1:
        raise ValueError("chunk_words must be a positive integer")

    words = data.split(" ")
    # Fixed strides guarantee every word lands in exactly one chunk, including
    # a short final chunk and texts shorter than a single chunk.
    doc_spilt = [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
    # Whitespace-only chunks carry nothing worth embedding.
    doc_spilt = [chunk for chunk in doc_spilt if chunk.strip()]
    if not doc_spilt:
        return ''

    # NOTE(review): these fallback credentials are committed to the repository.
    # Rotate them, provide the values via the environment, then drop the
    # literals.
    openai_api_key = os.environ.get(
        "OPENAI_API_KEY",
        "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",
    )
    pinecone_api_key = os.environ.get(
        "PINECONE_API_KEY",
        "ffb1f594-0915-4ebf-835f-c1eaa62fdcdc",
    )
    pinecone_environment = os.environ.get(
        "PINECONE_ENVIRONMENT",
        "us-west4-gcp-free",
    )

    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)
    Pinecone.from_texts(doc_spilt, embeddings, index_name="test", namespace='a1')

    return ''
|
268 |
|
269 |
|
270 |
|