Spaces:
Sleeping
Sleeping
| import chromadb | |
| from chromadb.utils import embedding_functions | |
| from .models import Param | |
| import os | |
| import torch | |
| from openai import OpenAI | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from itertools import islice, zip_longest | |
| import re | |
# Default model identifier (a GGUF file name). The Param table may override it
# further down in this file.
# Alternative noted by the author: "mistralai/Mistral-7B-Instruct-v0.2"
model_id = "Yugo60-GPT-GGUF.Q4_K_M.gguf"
# Leftover snippet, kept for reference:
# outputs = model.generate(**inputs, max_new_tokens=20)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Make MPS the default torch device when it is available; otherwise print
# which of the two prerequisites is missing.
_mps = torch.backends.mps
if _mps.is_available():
    torch.set_default_device("mps")
elif _mps.is_built():
    print("MPS not available because the current MacOS version is not 12.3+ "
          "and/or you do not have an MPS-enabled device on this machine.")
else:
    print("MPS not available because the current PyTorch install was not "
          "built with MPS enabled.")
# Built-in defaults. Most of them can be replaced by rows of the Param table
# in the try-block that follows this section.

# Placeholder; nothing in this section assigns a real model object.
model = ""

# --- Vector store and embedding ---
CHROMA_DATA_PATH = "/Users/zoranpopovic/uchat/chroma_data/"
EMBED_MODEL = "all-MiniLM-L6-v2"
# Other embedding models noted by the author:
#   NousResearch/Hermes-2-Pro-Mistral-7B
#   distilbert-base-multilingual-case
#   paraphrase-multilingual-MiniLM-L12-v2d
COLLECTION_NAME = "chroma_data"

# --- Source documents and chunking ---
PDF_PATH = "./PDF/uputstvo_uz_eon_smart_box-1.pdf"
PDF_PATH2 = "./PDF/uputstvo_uz_eon_smart_aplikaciju-1.pdf"
CHUNK_SIZE, CHUNK_OVERLAP = 800, 50

# --- Retrieval ---
max_results = 3
min_len = 40  # pages whose cleaned text is shorter than this are treated as too short
min_distance, max_distance = 0.35, 0.6

# --- Generation settings (presumably passed to the chat-completion call,
# which is not in this part of the file) ---
temperature = 0.55
max_tokens = 3072
top_p = 0.8
frequency_penalty = 0.0
presence_penalty = 0.15

DEBUG = True

# Serbian system prompt, assembled from two sentences.
system_sr = (
    "Zoveš se U-Chat AI asistent i pomažeš korisniku usluga kompanije United Group. Korisnik postavlja pitanje ili problem, upareno sa dodatnima saznanjima. Na osnovu toga napiši korisniku kratak i ljubazan odgovor koji kompletira njegov zahtev ili mu daje odgovor na pitanje. "
    # Optional sentence the author left disabled:
    # " Ako ne znaš odgovor, reci da ne znaš, ne izmišljaj ga."
    "Usluge kompanije United Group uključuju i kablovsku mrežu za digitalnu televiziju, pristup internetu, uređaj EON SMART BOX za TV sadržaj, kao i fiksnu telefoniju."
)

# Per-language texts; only Serbian has built-in content.
_LANGUAGES = ('srpski', 'hrvatski', 'slovenački', 'makedonski')
system = {language: "" for language in _LANGUAGES}
system['srpski'] = system_sr
ctxpre = ""
msg_content = {language: "" for language in _LANGUAGES}
msg_content['srpski'] = "- Dodatna saznanja su: "

max_conv = 3
# Overlay the built-in defaults with the values stored in the Param table.
# The system prompt and message prefix are stored per language (`jezik`).
# Every other setting is a plain module global, so when several rows exist
# the values of the last row iterated win.
try:
    for edit in Param.objects.all():
        system[edit.jezik] = edit.system
        ctxpre = edit.ctxpre
        msg_content[edit.jezik] = edit.msg_content
        min_len = edit.min_len
        CHUNK_SIZE = edit.CHUNK_SIZE
        CHUNK_OVERLAP = edit.CHUNK_OVERLAP
        max_results = edit.max_results
        EMBED_MODEL = edit.EMBED_MODEL
        model_id = edit.model_id
        min_distance = edit.min_distance
        max_distance = edit.max_distance
        max_conv = edit.max_conv
        temperature = edit.temperature
        top_p = edit.top_p
        max_tokens = edit.max_tokens
        presence_penalty = edit.presence_penalty
        frequency_penalty = edit.frequency_penalty
        DEBUG = edit.DEBUG
except Exception as exc:
    # Still best effort: the module must import even when the table cannot be
    # read (presumably before the database is ready - confirm). Catching
    # Exception rather than using a bare except lets KeyboardInterrupt and
    # SystemExit propagate, and the failure is reported instead of hidden.
    print(f"Param settings not loaded, keeping built-in defaults: {exc!r}")
def load_and_split_document(pdf_path):
    """Read the PDF at *pdf_path* and return ``PyPDFLoader.load_and_split()``'s result.

    A ``Loaded: <path>`` line is printed as soon as the loader object exists,
    i.e. before the document content is actually read and split.
    """
    pdf_loader = PyPDFLoader(pdf_path)
    print('Loaded: ' + pdf_path)
    documents = pdf_loader.load_and_split()
    return documents
def split_text_into_chunks(pages, chunk_size, chunk_overlap):
    """Clean page text, drop pages that are too short, and split the rest.

    Args:
        pages: documents exposing a writable ``page_content`` string (the
            output of ``load_and_split_document``). Their text is cleaned in
            place; the list itself is not modified.
        chunk_size: forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``split_documents`` applied to every page whose cleaned
        text has at least ``min_len`` characters (module-level setting).
    """
    for page in pages:
        # NOTE(review): r'\n' matches a literal backslash followed by "n", not
        # a newline; real newlines are collapsed by the \s+ substitution below.
        # Left unchanged - confirm the intent.
        text = page.page_content.replace(". .", "").replace(r'\n', '.')
        page.page_content = re.sub(r'\s+', " ", text).replace('..', '')

    # The previous in-place compaction truncated with pages[:n-1]. That lost
    # one valid page, returned nothing when the first page was short, and
    # kept a short page when it was the only one. A plain filter has none of
    # these edge cases.
    kept_pages = [page for page in pages if len(page.page_content) >= min_len]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(kept_pages)
def batched(iterable, n):
    """Yield consecutive lists of up to *n* items taken from *iterable*.

    The last list may be shorter than *n*. Nothing is yielded when the
    iterable is empty or when *n* is 0.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps slicing until an empty list comes back.
    yield from iter(lambda: list(islice(source, n)), [])
# Disabled snippet (presumably used to drop the collection before a rebuild):
# client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)
# client.allow_reset = True
# client.delete_collection(COLLECTION_NAME)

# OpenAI-compatible client pointed at a server on localhost:4891; the API key
# is a dummy value.
oc = OpenAI(base_url="http://localhost:4891/v1", api_key="not-needed")

# Persistent Chroma store plus the sentence-transformer used to embed documents.
chroma_client = chromadb.PersistentClient(CHROMA_DATA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)
collection = chroma_client.get_or_create_collection(
    # Use the shared constant instead of repeating the "chroma_data" literal.
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)
# Number of items already stored; the loaders in this file derive new ids
# ("id<number>") from it.
last = collection.count()
def update_collection(docs, last, jezik):
    """Add state-labelled documents to the Chroma collection.

    Args:
        docs: flat sequence alternating ``label, payload, label, payload, ...``.
            ``payload[0]`` holds the documents stored with ``state == next ==
            label``. Every further entry of ``payload`` is a sequence
            ``[next_label, doc, doc, ...]`` whose documents are stored with
            ``state == label`` and ``next == next_label``. (Shape inferred
            from the indexing - confirm against the caller.)
        last: numeric suffix of the first id to assign (ids are ``id<number>``).
        jezik: language tag written to the ``lang`` metadata field.

    Returns:
        The first id suffix that is still unused after all additions.

    Raises:
        IndexError: if a label is not followed by its payload.
    """
    # NOTE(review): `bot` is not imported in the visible part of this file;
    # confirm the package is in scope wherever this function is called.
    store = bot.uchat.collection

    def _add(documents, state, next_state, first_id):
        """Store *documents* under consecutive ids and return how many were added."""
        count = len(documents)
        if count == 0:
            # Chroma's id validation rejects an empty batch, so skip the call.
            return 0
        store.add(
            documents=documents,
            ids=[f"id{first_id + offset}" for offset in range(count)],
            metadatas=[
                {"state": state, "next": next_state, "used": False,
                 "source": 'None', "page": -1, "lang": jezik}
                for _ in range(count)
            ],
        )
        return count

    for position, label in enumerate(docs[0::2]):
        payload = docs[2 * position + 1]
        # Documents that belong to the state itself.
        last += _add(payload[0], label, label, last)
        # Documents attached to a transition from this state to another one.
        for transition in payload[1:]:
            last += _add(transition[1:], label, transition[0], last)
    return last
| #docus = load_and_split_document(PDF_PATH) + load_and_split_document(PDF_PATH2) | |
def load_docs(path, jezik):
    """Load a PDF, chunk it, and add the chunks to the Chroma collection.

    Args:
        path: location of the PDF, passed to ``load_and_split_document``.
        jezik: language tag written to the ``lang`` metadata field.

    Returns:
        The first id suffix that is still unused: ``bot.uchat.last`` at call
        time plus the number of chunks added. The module-level counter itself
        is not updated here.
    """
    docus = load_and_split_document(path)
    pages = split_text_into_chunks(docus, CHUNK_SIZE, CHUNK_OVERLAP)

    # The previous version did `last += ...` without declaring `last` global,
    # so its first read raised UnboundLocalError. It also added the offset to
    # the ids twice and indexed `pages` with offset-shifted numbers. A local
    # counter seeded from the module value avoids all three problems.
    # NOTE(review): `bot` is not imported in the visible part of this file;
    # confirm the package is in scope wherever this function is called.
    next_id = bot.uchat.last
    batch_size = 66  # same batch size as before

    for batch in batched(pages, batch_size):
        bot.uchat.collection.add(
            ids=[f"id{next_id + offset}" for offset in range(len(batch))],
            documents=[page.page_content for page in batch],
            metadatas=[
                {**page.metadata, "used": False, "next": 'None',
                 "state": 'None', "lang": jezik}
                for page in batch
            ],
        )
        next_id += len(batch)
    return next_id