import pinecone
import os
import PyPDF2
import gradio as gr
from tqdm import tqdm
from pydantic import Field
from langchain.load.serializable import Serializable
# from langchain.vectorstores import Pinecone
from custom_vectordb import Pinecone
from config import PINECONE_API_KEY, PINECONE_ENVIRONMENT, INDEX_NAME, CONNECTION_STRING, CONTAINER_NAME, NAME_SPACE_1, NAME_SPACE_2
from config import EMBEDDING_API_BASE, EMBEDDING_API_KEY, OPENAI_API_TYPE, OPENAI_API_VERSION, EMBEDDING_DEPLOYMENT_ID
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from azure.storage.blob import BlobServiceClient
# initialize pinecone
pinecone.init(
api_key=PINECONE_API_KEY, # find at app.pinecone.io
environment=PINECONE_ENVIRONMENT, # next to api key in console
)
# Azure embedding model definition
embeddings = OpenAIEmbeddings(
deployment=EMBEDDING_DEPLOYMENT_ID,
openai_api_key=EMBEDDING_API_KEY,
openai_api_base=EMBEDDING_API_BASE,
openai_api_type=OPENAI_API_TYPE,
openai_api_version=OPENAI_API_VERSION,
chunk_size=16
)
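# Split documents into ~500-token chunks with 30-token overlap before embedding.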
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=30)
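# Create the Pinecone index on first run; dimension=1536 matches OpenAI ada-002 embeddings.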
if INDEX_NAME and INDEX_NAME not in pinecone.list_indexes():
pinecone.create_index(
INDEX_NAME,
metric="cosine",
dimension=1536
)
print(f"Index {INDEX_NAME} created successfully")
index = pinecone.Index(INDEX_NAME)
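# Azure Blob Storage client used to persist the raw uploaded files.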
blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
class Document(Serializable):
"""Class for storing a piece of text and associated metadata."""
page_content: str
"""String text."""
metadata: dict = Field(default_factory=dict)
"""Arbitrary metadata about the page content (e.g., source, relationships to other
documents, etc.).
"""
# def update_fb():
# with open('data.json') as json_file:
# data = json.load(json_file)
# datas = ast.literal_eval(data)
# texts = []
# for k, v in datas.items():
# content = v["content"].split("-----")[0] + "\nimage_link: " + str(v["image"])
# post_url = v["post_url"]
# texts.append(Document(page_content=content, metadata={"source": post_url}))
# if len(texts)>0:
# Pinecone.from_documents(texts, embeddings, index_name=INDEX_NAME, namespace=NAME_SPACE_2)
# message = f"Add facebook data to space {NAME_SPACE_2} in {INDEX_NAME} sucessfully"
# return message
def upload_files_blob(file_path):
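    """Upload a local file to the Azure Blob Storage container."""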
file_name = os.path.basename(file_path)
blob_client = blob_service_client.get_blob_client(container=CONTAINER_NAME, blob=file_name)
with open(file_path,'rb') as data:
blob_client.upload_blob(data)
print(f"Uploaded {file_name}.")
def load_files_blob():
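    """Return the names of all blobs currently stored in the container."""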
container_client = blob_service_client.get_container_client(CONTAINER_NAME)
files_name = []
for blob in container_client.list_blobs():
files_name.append(blob.name)
return files_name
def delete_blob(blob_name):
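    """Delete a single blob from the container by name."""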
# Get container client
container_client = blob_service_client.get_container_client(CONTAINER_NAME)
container_client.delete_blob(blob_name)
print(f"Deleted {blob_name}")
def delete_all():
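    """Delete every blob in the container and all vectors in NAME_SPACE_1."""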
container_client = blob_service_client.get_container_client(CONTAINER_NAME)
blob_list = container_client.list_blobs()
for blob in blob_list:
container_client.delete_blob(blob.name)
index.delete(delete_all=True, namespace=NAME_SPACE_1)
message = f"Delete all files in space {NAME_SPACE_1} succesfully"
return gr.update(choices=[]), message, gr.Files.update(None)
def delete_file(files_src):
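    """Remove the selected files from blob storage and delete their vectors from NAME_SPACE_1 via a metadata filter."""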
file_name = []
for files in files_src:
delete_blob(files)
file_name.append(files)
_filter = {"source": {"$in": file_name}}
index.delete(filter=_filter, namespace=NAME_SPACE_1)
message = f"Delete {len(files_src)} files in space {NAME_SPACE_1} files succesfully"
available_files = load_files_blob()
return gr.update(choices=available_files), message, gr.Files.update(None)
def upload_file(check_box):
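    """Return a vectorstore over NAME_SPACE_1 if the checkbox is ticked, NAME_SPACE_2 otherwise."""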
if check_box:
namespace = NAME_SPACE_1
else:
namespace = NAME_SPACE_2
vectorstore = Pinecone.from_existing_index(INDEX_NAME, embeddings, namespace=namespace)
print(f"Load files from space {namespace} in {INDEX_NAME}")
return vectorstore
def handle_upload_file(files):
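    """Embed newly uploaded documents and index them into NAME_SPACE_1."""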
documents = get_documents(files)
if len(documents)>0:
Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME, namespace=NAME_SPACE_1)
message = f"Add files to space {NAME_SPACE_1} in {INDEX_NAME} sucessfully"
print(message)
else:
message = f"Load files from space existing {NAME_SPACE_1} in {INDEX_NAME}"
print(message)
return message
def get_documents(file_src):
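    """Parse uploaded files (.pdf, .docx, .pptx, or plain text) into chunked Documents, skipping files already present in blob storage."""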
documents = []
if file_src is None:
return documents
available_files = load_files_blob()
for file in file_src:
filepath = file.name
filename = os.path.basename(filepath)
file_type = os.path.splitext(filename)[1]
if filename in available_files:
continue
else:
upload_files_blob(filepath)
try:
if file_type == ".pdf":
pdftext = ""
with open(filepath, "rb") as pdfFileObj:
pdf_reader = PyPDF2.PdfReader(pdfFileObj)
for page in tqdm(pdf_reader.pages):
pdftext += page.extract_text()
texts = [Document(page_content=pdftext, metadata={"source": filename})]
elif file_type == ".docx":
from langchain.document_loaders import UnstructuredWordDocumentLoader
loader = UnstructuredWordDocumentLoader(filepath)
texts = loader.load()
elif file_type == ".pptx":
from langchain.document_loaders import UnstructuredPowerPointLoader
loader = UnstructuredPowerPointLoader(filepath)
texts = loader.load()
else:
from langchain.document_loaders import TextLoader
loader = TextLoader(filepath, "utf8")
texts = loader.load()
        except Exception:
            import traceback
            traceback.print_exc()
            # Skip this file: `texts` is undefined if parsing failed.
            continue
texts = text_splitter.split_documents(texts)
documents.extend(texts)
return documents
if __name__ == "__main__":
    # upload_file expects a boolean namespace flag, not a file list;
    # True selects NAME_SPACE_1.
    upload_file(True)