Spaces:
Running
Running
File size: 11,469 Bytes
4993b07 fd1323d 479eeac 4993b07 5ad665c 92b1f74 7382bd4 7fffc01 53833ab 3b59a51 b618a75 b6aee82 b618a75 6c5699f 983f039 6c5699f a55659f 6c5699f a55659f 6c5699f 4993b07 6c5699f 983f039 6c5699f 983f039 6c5699f 4993b07 6c5699f 4993b07 6c5699f 983f039 6c5699f 983f039 8c91e59 fd1323d 49ecf68 fd1323d 49ecf68 fd1323d 49ecf68 8c91e59 fd1323d 49ecf68 fd1323d 49ecf68 fd1323d 49ecf68 82258a9 49ecf68 82258a9 49ecf68 6ce06d2 4993b07 f94f8e4 4993b07 53833ab de66deb 4993b07 53833ab b65f639 4993b07 53833ab b65f639 53833ab b65f639 4993b07 b65f639 4993b07 1d07d89 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 |
from docx import Document
import json
import datetime
import tempfile
from pathlib import Path
from unidecode import unidecode
from langchain_community.document_loaders import JSONLoader, UnstructuredWordDocumentLoader, WebBaseLoader, AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
import google.generativeai as genai
from tqdm import tqdm
from pathlib import Path
import shutil
import requests
from bs4 import BeautifulSoup
import os
from langchain_docling import DoclingLoader#, ExportType
from langchain_docling.loader import ExportType
import logging
# logging.getLogger("langchain").setLevel(logging.ERROR)
# Raise the root logger's threshold to ERROR so that lower-severity records
# (DEBUG/INFO/WARNING) reaching the root logger are dropped.
logging.getLogger().setLevel(logging.ERROR)
# from file_loader import get_vectorstore
# Read the Google API key once at import time. Indexing os.environ raises
# KeyError when GOOGLE_API_KEY is not set, so importing this module fails
# fast in a misconfigured environment.
key = os.environ["GOOGLE_API_KEY"]
# import asyncio
# from urllib.parse import urljoin
# from playwright.async_api import async_playwright
# from langchain_community.document_loaders import AsyncHtmlLoader
# from langchain_community.document_transformers import Html2TextTransformer
# from tqdm.asyncio import tqdm
# async def _fetch_urls(base_url):
# """Extract all links from a JavaScript-rendered webpage."""
# async with async_playwright() as p:
# try:
# browser = await p.chromium.launch(headless=True)
# page = await browser.new_page()
# await page.goto(base_url)
# await page.wait_for_load_state("networkidle")
# urls = set()
# links = await page.locator("a").all()
# for link in links:
# href = await link.get_attribute("href")
# if href and "#" not in href:
# full_url = urljoin(base_url, href)
# if full_url.startswith(base_url):
# urls.add(full_url)
# await browser.close()
# except Exception as e:
# print(f"⚠️ Không thể truy cập {base_url}: {e}")
# return [] # Trả về danh sách rỗng nếu gặp lỗi
# return list(urls)
# async def _fetch_web_content(urls):
# """Fetch HTML content and convert it to text, with a progress bar."""
# docs = []
# progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
# for page_url in urls:
# try:
# loader = AsyncHtmlLoader(page_url)
# html2text = Html2TextTransformer()
# html = await loader.aload()
# doc = html2text.transform_documents(html)
# docs.extend(doc)
# except Exception as e:
# print(f"Error loading {page_url}: {e}")
# progress_bar.update(1) # Update progress bar
# progress_bar.close()
# return docs
# def scrape_website(base_urls):
# """
# Scrapes a list of base URLs and extracts their content.
# Includes a progress bar for tracking.
# """
# async def _main():
# all_urls = []
# for base_url in base_urls:
# urls = await _fetch_urls(base_url)
# all_urls.extend(urls)
# docs = await _fetch_web_content(all_urls)
# return docs
# return asyncio.run(_main())
# class ChunkerWrapper:
# def __init__(self, splitter):
# self.splitter = splitter
# def chunk(self, text):
# # Use the 'split_text' method of the splitter to divide the text
# return self.splitter.split_text(text)
# def get_web_documents(base_urls=['https://nct.neu.edu.vn/']):
# """Tải nội dung từ danh sách URL với thanh tiến trình"""
# docs = []
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
# chunker = ChunkerWrapper(text_splitter)
# for page_url in tqdm(base_urls, desc="Đang tải trang", unit="url"):
# try:
# # loader = WebBaseLoader(page_url)
# loader = DoclingLoader(file_path=page_url,chunker=chunker # This will break your doc into manageable pieces.
# )
# html = loader.load()
# doc = html
# docs.extend(doc)
# except Exception as e:
# print(f"Lỗi khi tải {page_url}: {e}")
# print(f"Tải thành công {len(docs)} trang.")
# return docs
# def load_text_data(file_path):
# """Tải nội dung văn bản từ file DOCX (đã loại bảng)."""
# # cleaned_file = Document(file_path) #remove_tables_from_docx(file_path)
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
# chunker = ChunkerWrapper(text_splitter)
# return DoclingLoader(file_path=file_path, chunker=chunker # This will break your doc into manageable pieces.
# ).load()
def get_web_documents(base_urls=('https://nct.neu.edu.vn/',)):
    """Load web pages with Docling and return their chunked documents.

    Every URL is loaded on its own with ``DoclingLoader`` configured for
    ``ExportType.DOC_CHUNKS``. A URL that raises while loading is reported
    on stdout and skipped, so one failing page does not abort the run.

    Args:
        base_urls: A single URL string or an iterable of URL strings.
            Defaults to ``('https://nct.neu.edu.vn/',)``.

    Returns:
        list: The documents returned by the loader for every URL that
        loaded successfully, in input order. Empty if nothing loaded.
    """
    # A bare string would otherwise be iterated character by character,
    # producing one failed "URL" per character.
    if isinstance(base_urls, str):
        base_urls = [base_urls]

    docs = []
    for page_url in tqdm(base_urls, desc="Loading page", unit="url"):
        try:
            loader = DoclingLoader(
                file_path=page_url,
                export_type=ExportType.DOC_CHUNKS,  # let the loader chunk the page
            )
            docs.extend(loader.load())
        except Exception as e:  # deliberate best-effort: report and continue
            print(f"Error loading {page_url}: {e}")
    print(f"Successfully loaded {len(docs)} documents.")
    return docs
def load_text_data(file_path):
    """Load a document with Docling as Markdown and split it into chunks.

    The file is converted by ``DoclingLoader`` with
    ``ExportType.MARKDOWN`` and the resulting documents are then split with
    a ``RecursiveCharacterTextSplitter`` (chunk size 512, overlap 100).

    Args:
        file_path: Path (or other source accepted by ``DoclingLoader``) of
            the document to load.

    Returns:
        list: The split documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
    loader = DoclingLoader(
        file_path=file_path,
        export_type=ExportType.MARKDOWN,
    )
    # NOTE(review): the splitter used to be handed to DoclingLoader as
    # `chunker`. Per the langchain-docling documentation that argument is
    # only consulted for DOC_CHUNKS exports and expects a Docling chunker,
    # so in MARKDOWN mode the 512/100 splitter was never applied. It is
    # applied explicitly here instead.
    return text_splitter.split_documents(loader.load())
def log_message(messages, filename="chat_log.txt"):
    """Append a conversation to a log file, one JSON object per line.

    Each line holds a ``timestamp`` (local time, ``YYYY-MM-DD HH:MM:SS``)
    and the ``conversation`` payload. Non-ASCII text is written verbatim
    rather than escaped.

    Args:
        messages: JSON-serialisable conversation history to record.
        filename: Path of the log file; opened in append mode as UTF-8.
    """
    with open(filename, "a", encoding="utf-8") as log_file:
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        record = {"timestamp": stamp, "conversation": messages}
        serialized = json.dumps(record, ensure_ascii=False)
        log_file.write(f"{serialized}\n")
def remove_tables_from_docx(file_path):
    """Create a copy of a DOCX file that contains no tables.

    Only the text of the paragraphs exposed by ``Document.paragraphs`` is
    copied into a fresh document, so tables are left out. The source file
    is never written to.

    Args:
        file_path: Path of the DOCX file to copy.

    Returns:
        str: Path of the newly written temporary ``.docx`` file. It is
        created with ``delete=False``, so the caller is responsible for
        removing it.
    """
    source = Document(file_path)
    stripped = Document()
    for paragraph in source.paragraphs:
        stripped.add_paragraph(paragraph.text)

    # Reserve a persistent temp file name, then let the handle close
    # before python-docx writes to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as handle:
        output_path = handle.name
    stripped.save(output_path)
    return output_path  # path of the new file; the original is untouched
def extract_tables_from_docx(file_path):
    """Extract body-level tables from a DOCX file with their nearby text.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        list[dict]: One entry per table that has more than one row, each
        with the keys:

        * ``"table"``: 1-based position of the table among the document's
          body-level tables (skipped tables still consume a number).
        * ``"content"``: list of rows, each a list of stripped cell texts.
        * ``"related"``: up to five non-empty paragraphs before the table
          followed by up to five after it.
    """
    # Local import: same package as the module-level `Document` import.
    from docx.text.paragraph import Paragraph

    doc = Document(file_path)
    # All non-empty paragraph texts; the "related" window is sliced from here.
    all_paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]

    # For every table in the body, remember how many NON-EMPTY paragraphs
    # precede it. Counting empty paragraphs too would shift the index away
    # from `all_paragraphs`, which has the empty ones filtered out.
    table_positions = []
    non_empty_seen = 0
    for element in doc.element.body:
        if element.tag.endswith("tbl"):
            table_positions.append(non_empty_seen)
        elif element.tag.endswith("p"):
            if Paragraph(element, doc).text.strip():
                non_empty_seen += 1

    # `doc.tables` is rebuilt on every access, so fetch it once.
    body_tables = doc.tables
    tables = []
    for table_idx, para_idx in enumerate(table_positions):
        data = [
            [cell.text.strip() for cell in row.cells]
            for row in body_tables[table_idx].rows
        ]
        if len(data) > 1:  # keep only tables that have rows beyond the header
            # Five non-empty paragraphs before and five after the table.
            related_start = max(0, para_idx - 5)
            related_end = min(len(all_paragraphs), para_idx + 5)
            related_text = all_paragraphs[related_start:related_end]
            tables.append(
                {"table": table_idx + 1, "content": data, "related": related_text}
            )
    return tables
def convert_to_json(tables):
    """Turn extracted tables into a JSON string of accent-free records.

    The first row of each table's ``"content"`` is treated as the header.
    Every following row whose length matches the header becomes a
    ``{header: cell}`` dict; rows of a different length are dropped.
    Headers, cells and related text are all passed through ``unidecode``.

    Args:
        tables: Iterable of dicts with ``"table"``, ``"content"`` and
            ``"related"`` keys.

    Returns:
        str: JSON (4-space indent, non-ASCII kept) mapping each table id to
        ``{"content": [...records...], "related": [...text...]}``.
    """
    result = {}
    for entry in tables:
        content = entry["content"]
        columns = [unidecode(name) for name in content[0]]  # accent-free headers
        records = []
        for raw_row in content[1:]:
            cells = [unidecode(value) for value in raw_row]  # accent-free cells
            if len(cells) == len(columns):
                records.append(dict(zip(columns, cells)))
        context = [unidecode(line) for line in entry["related"]]  # accent-free context
        result[entry["table"]] = {"content": records, "related": context}
    return json.dumps(result, indent=4, ensure_ascii=False)
def save_json_to_file(json_data, output_path):
    """Write a JSON string to a file, re-serialised with a 4-space indent.

    Args:
        json_data: A string containing valid JSON.
        output_path: Destination path; written as UTF-8 and overwritten if
            it already exists.

    Raises:
        json.JSONDecodeError: If ``json_data`` is not valid JSON. The
            destination file is left untouched in that case.
    """
    # Parse before opening: mode 'w' truncates the file immediately, so
    # parsing inside the `with` would wipe an existing file on bad input.
    parsed = json.loads(json_data)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(parsed, f, ensure_ascii=False, indent=4)
# def load_json_with_langchain(json_path):
# loader = JSONLoader(file_path=json_path, jq_schema='.. | .content?', text_content=False)
# data = loader.load()
# # # Kiểm tra xem dữ liệu có bị lỗi không
# # print("Sample Data:", data[:2]) # In thử 2 dòng đầu
# return data
def load_json_manually(json_path):
    """Read a UTF-8 JSON file and return the decoded Python object.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The object decoded from the file's JSON content.
    """
    with open(json_path, mode='r', encoding='utf-8') as handle:
        return json.loads(handle.read())
def load_table_data(file_path, output_json_path):
    """Extract a DOCX file's tables, persist them as JSON and reload them.

    Args:
        file_path: Path of the DOCX file whose tables are extracted.
        output_json_path: Path the intermediate JSON file is written to.

    Returns:
        The table data as read back from ``output_json_path``.
    """
    serialized = convert_to_json(extract_tables_from_docx(file_path))
    save_json_to_file(serialized, output_json_path)
    return load_json_manually(output_json_path)
def get_splits(file_path, output_json_path):
    """Return the text documents loaded from ``file_path``.

    The table extraction and the separate JSON/text re-splitting steps
    that used to run here are disabled, so the output of
    ``load_text_data`` is returned unchanged.

    Args:
        file_path: Path of the document to load.
        output_json_path: Accepted for interface compatibility; currently
            unused.

    Returns:
        Whatever ``load_text_data(file_path)`` returns.
    """
    return load_text_data(file_path)
def get_json_splits_only(file_path):
    """Load a JSON file, strip accents and split it into small documents.

    The data is first split structurally with ``RecursiveJsonSplitter``
    (max chunk size 2000) and the resulting documents are split again with
    ``RecursiveCharacterTextSplitter`` (chunk size 512, overlap 100).

    Args:
        file_path: Path of the JSON file to load.

    Returns:
        The documents produced by the second (character-level) split.
    """
    def remove_accents(obj):
        # Recursively transliterate every string (dict keys included) with
        # unidecode; any other value type is passed through unchanged.
        if isinstance(obj, str):
            return unidecode(obj)
        if isinstance(obj, list):
            return [remove_accents(item) for item in obj]
        if isinstance(obj, dict):
            return {
                remove_accents(name): remove_accents(value)
                for name, value in obj.items()
            }
        return obj

    ascii_data = remove_accents(load_json_manually(file_path))
    # A top-level list is wrapped under a "data" key before splitting.
    if isinstance(ascii_data, list):
        payload = {"data": ascii_data}
    else:
        payload = ascii_data

    json_splitter = RecursiveJsonSplitter(max_chunk_size=2000)
    char_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
    json_docs = json_splitter.create_documents(texts=[payload])
    return char_splitter.split_documents(json_docs)
def list_docx_files(folder_path):
    """Return the paths of all ``*.docx`` files under a folder, recursively.

    Args:
        folder_path: Directory to search.

    Returns:
        list[str]: One string path per match, in the order ``Path.rglob``
        yields them.
    """
    matches = Path(folder_path).rglob("*.docx")
    return list(map(str, matches))
def prompt_order(queries):
    """Build the prompt section that lists the user's questions in order.

    Args:
        queries: Iterable of questions; each item is converted with
            ``str``.

    Returns:
        str: A fixed instruction line followed by one
        ``"Question <n>: <text>"`` line per query, numbered from 1. Every
        line, including the last, ends with a newline.
    """
    header = 'IMPORTANT: Here is the questions of user in order, use that and the context above to know the best answer:\n'
    numbered = [
        f'Question {position}: {query!s}\n'
        for position, query in enumerate(queries, start=1)
    ]
    return header + ''.join(numbered)