# NOTE: page-header residue from the hosting site ("Spaces: Sleeping"); not part of the program.
import os
import re
import shutil
import time
from datetime import date
from pathlib import Path
from typing import Optional

import nltk
from cleantext import clean
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from spellchecker import SpellChecker
# Make the local NLTK data directory searchable, then fetch the tokenizer
# resources that sentence splitting relies on.
nltk.data.path.append('/home/user/nltk_data')
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)
class Preprocessor:
    """Clean raw text, correct its spelling and normalise its spacing."""

    def __init__(self):
        self.spell_checker = SpellChecker()

    @staticmethod
    def clean_text(text: str, lower: bool = False, lang: str = "en") -> str:
        """Strip noise and unwanted characters from *text*.

        Delegates to ``cleantext.clean`` with Unicode fixing, ASCII
        transliteration, line-break removal and URL / e-mail / phone-number /
        currency-symbol handling enabled; numbers, digits and punctuation are
        left in place.

        Args:
            text: Text to clean.
            lower: Forwarded as ``lower`` to ``clean``.
            lang: Forwarded as ``lang`` to ``clean``.

        Returns:
            The cleaned text.
        """
        # Declared static so that both ``Preprocessor.clean_text(t)`` and
        # ``self.clean_text(t)`` bind ``t`` to ``text``.
        return clean(
            text,
            fix_unicode=True,
            to_ascii=True,
            lower=lower,
            no_line_breaks=True,
            no_urls=True,
            no_emails=True,
            no_phone_numbers=True,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=True,
            no_punct=False,
            lang=lang,
        )

    @staticmethod
    def correct_spacing(text: str, exceptions: Optional[list] = None) -> str:
        """Fix spacing around punctuation marks and the given exceptions.

        Args:
            text: Text whose spacing should be normalised.
            exceptions: Abbreviations whose own spelling is re-applied after
                whitespace has been collapsed. Defaults to a short list of
                common abbreviations.

        Returns:
            The text with whitespace runs collapsed, no space before
            ``? . ! "`` (when followed by whitespace or end of text) or before
            a comma, and surrounding whitespace stripped.
        """
        if exceptions is None:
            exceptions = ["e.g.", "i.e.", "etc.", "cf.", "vs.", "p."]
        # Collapse every whitespace run (including newlines) to one space.
        text = re.sub(r"\s+", " ", text)
        # Drop the space before sentence punctuation / closing quotes.
        text = re.sub(r'\s([?.!"](?:\s|$))', r"\1", text)
        text = re.sub(r"\s,", r",", text)
        for exception in exceptions:
            # Restores the exception's own whitespace where collapsing changed
            # it. NOTE(review): for entries without internal whitespace (all
            # of the defaults) this replacement leaves the text unchanged.
            text = text.replace(" ".join(exception.split()), exception)
        return text.strip()

    @staticmethod
    def split_into_sentences(text: str) -> list:
        """Split *text* into sentences with NLTK's ``sent_tokenize``."""
        from nltk.tokenize import sent_tokenize

        return sent_tokenize(text)

    def correct_spelling(self, text: str) -> str:
        """Spell-correct *text* one whitespace-separated token at a time.

        Args:
            text: Text to correct.

        Returns:
            The corrected tokens joined by single spaces. A token for which
            the spell checker offers no correction is kept unchanged.
        """
        # ``correction`` may return None when it has no candidate; fall back
        # to the original token so the join below never receives None.
        corrected_words = [
            self.spell_checker.correction(word) or word for word in text.split()
        ]
        return " ".join(corrected_words)

    def preprocess_text(self, text: str) -> str:
        """Clean *text*, correct its spelling, then fix its spacing."""
        cleaned = self.clean_text(text)
        corrected = self.correct_spelling(cleaned)
        return self.correct_spacing(corrected)

    def clean_sentences(self, sentences: list) -> list:
        """Apply :meth:`clean_text` to every sentence in *sentences*."""
        return [self.clean_text(sentence) for sentence in sentences]
class PDFProcessor:
    """Convert PDF files to preprocessed text using an OCR model."""

    def __init__(self, max_pages=20):
        self.ocr_model = ocr_predictor(pretrained=True)
        self.max_pages = max_pages

    def pdf_to_text(self, file_path: str) -> str:
        """Convert a PDF file to text using OCR.

        At most ``self.max_pages`` pages are processed.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The recognised text of all blocks, joined by newlines and passed
            through ``Preprocessor.preprocess_text``.

        Raises:
            ValueError: If the loaded document is neither a list of pages nor
                an object exposing ``pages``.
        """
        pdf_file = Path(file_path)
        doc = DocumentFile.from_pdf(pdf_file)
        if isinstance(doc, list):
            # A list holds the rendered page images, which carry no text
            # structure yet: run the OCR model to obtain pages with blocks.
            # Slicing past the end is safe, so no length check is needed.
            images = doc[: self.max_pages]
            pages = self.ocr_model(images).pages if images else []
        elif hasattr(doc, "pages"):
            # Already a structured document: use its pages directly.
            pages = doc.pages[: self.max_pages]
        else:
            raise ValueError("Formato inesperado para el documento PDF.")
        raw_text = "\n".join(
            self._block_text(block) for page in pages for block in page.blocks
        )
        return Preprocessor().preprocess_text(raw_text)

    @staticmethod
    def _block_text(block) -> str:
        """Return the text of one block via ``render()``, else its ``text``."""
        render = getattr(block, "render", None)
        return render() if callable(render) else block.text
class FileHandler:
    """Helpers for temporary files and post-processing clean-up."""

    @staticmethod
    def save_temp_file(file_obj, temp_dir: Optional[Path] = None) -> str:
        """Save an uploaded file to a temporary directory and return its path.

        Args:
            file_obj: Object exposing ``name`` and ``read()``; only the final
                component of ``name`` is used for the stored file.
            temp_dir: Target directory, created if missing. Defaults to
                ``temp`` relative to the current working directory.

        Returns:
            The resolved path of the written file, as a string.
        """
        temp_dir = Path("temp") if temp_dir is None else Path(temp_dir)
        # parents=True so a nested target directory does not fail.
        temp_dir.mkdir(parents=True, exist_ok=True)
        temp_path = temp_dir / Path(file_obj.name).name
        with open(temp_path, "wb") as f:
            f.write(file_obj.read())
        return str(temp_path.resolve())

    @staticmethod
    def clear_temp_files(directory="temp", name_contains="RESULT_"):
        """Delete files in *directory* whose name contains *name_contains*.

        Does nothing when the directory does not exist; sub-directories are
        left untouched.
        """
        temp_dir = Path(directory)
        if not temp_dir.exists():
            return
        for file in temp_dir.iterdir():
            if file.is_file() and name_contains in file.name:
                file.unlink()

    @staticmethod
    def move_to_completed(from_dir: Path, filename: str, completed_dir="completed"):
        """Move a processed file into the *completed_dir* sub-folder.

        Args:
            from_dir: Directory currently holding the file (``Path`` or str).
            filename: Name of the file inside *from_dir*.
            completed_dir: Name of the sub-folder of *from_dir* that receives
                the file; created if missing.
        """
        # Accept plain strings as well as Path objects.
        from_dir = Path(from_dir)
        completed_path = from_dir / completed_dir
        completed_path.mkdir(parents=True, exist_ok=True)
        shutil.move(from_dir / filename, completed_path / filename)