Spaces:
Runtime error
Runtime error
app.py
Browse files
app.py
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import nltk
|
3 |
+
from nltk.corpus import stopwords, cess_esp, conll2002
|
4 |
+
from nltk.tokenize import word_tokenize
|
5 |
+
import stylecloud
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
from fpdf import FPDF
|
8 |
+
import re
|
9 |
+
from collections import Counter
|
10 |
+
import spacy
|
11 |
+
import os
|
12 |
+
import random
|
13 |
+
import string
|
14 |
+
import csv
|
15 |
+
|
16 |
+
# Download the NLTK resources this module needs
for _resource in ('punkt', 'stopwords', 'cess_esp', 'conll2002'):
    nltk.download(_resource)

# Load the spaCy model for Spanish
nlp = spacy.load('es_core_news_md')

# Extra stopwords, merged with NLTK's Spanish list in preprocess_text();
# add project-specific entries here.
additional_stopwords = []
|
29 |
+
|
30 |
+
# Text preprocessing
def preprocess_text(text):
    """Return the content words of *text*, ready for frequency counting.

    The text is lower-cased, every non-word character is replaced by a
    space, and the result is run through the spaCy pipeline ``nlp``.
    Tokens found in NLTK's Spanish stopword list or in
    ``additional_stopwords`` are dropped.

    Args:
        text: Raw input text.

    Returns:
        A list of strings in document order: the lemma for verbs,
        adjectives and nouns, and the surface form for any other token
        whose part of speech is not in the ignored set.
    """
    lemmatized_pos = {'VERB', 'ADJ', 'NOUN'}
    # Numbers, adverbs and function-word categories are never kept.
    ignored_pos = {
        'NUM', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'PRON', 'SCONJ', 'SYM', 'X',
    }

    text = re.sub(r'\W', ' ', text.lower())
    stop_words = set(stopwords.words('spanish')).union(additional_stopwords)

    filtered_words = []
    for token in nlp(text):
        # The substitution above leaves runs of spaces, which spaCy hands
        # back as whitespace tokens; they must not be counted as words.
        if token.is_space or token.text in stop_words:
            continue
        if token.pos_ in lemmatized_pos:
            filtered_words.append(token.lemma_)
        elif token.pos_ not in ignored_pos:
            filtered_words.append(token.text)
    return filtered_words
|
48 |
+
|
49 |
+
# Collect example sentences from several NLTK corpora
def get_example_sentences(word, num_sentences=1):
    """Return up to *num_sentences* corpus sentences that contain *word*.

    ``cess_esp`` is searched first, then the Spanish files of
    ``conll2002``.  A sentence matches when *word* is exactly equal to one
    of its tokens (the comparison is case-sensitive).

    Args:
        word: Token to look for.  Words of one character or less never
            match.
        num_sentences: Maximum number of sentences to return.

    Returns:
        A list of sentences, each one its tokens joined with single
        spaces.  The list is empty when nothing matches or when
        *num_sentences* is less than 1.
    """
    # Checked once up front: neither condition depends on the sentence, and
    # failing either one would otherwise mean a full scan of both corpora.
    if len(word) <= 1 or num_sentences < 1:
        return []

    sources = [cess_esp.sents()]
    # conll2002 also ships Dutch ("ned.*") files; restrict to the Spanish ones.
    spanish_ids = [fid for fid in conll2002.fileids() if fid.startswith('esp')]
    if spanish_ids:
        sources.append(conll2002.sents(spanish_ids))

    sentences = []
    for corpus_sents in sources:
        for sent in corpus_sents:
            if word in sent:
                sentences.append(' '.join(sent))
                if len(sentences) >= num_sentences:
                    return sentences
    return sentences
|
61 |
+
|
62 |
+
# Generate the word cloud with a randomly chosen icon shape
def generate_random_style_cloud(words, filename):
    """Render *words* as a stylecloud image and save it to *filename*.

    One of a fixed set of Font Awesome icon names is chosen at random as
    the cloud shape.  The generated image is then re-drawn through
    matplotlib without axes and saved over the same file.

    Args:
        words: Iterable of word strings; repeated words carry more weight.
        filename: Path of the image file to write.

    Returns:
        *filename*, unchanged.

    Raises:
        ValueError: If *words* contains no text to draw.
    """
    text = ' '.join(words)
    if not text.strip():
        # Fail early with a clear message instead of deep inside the
        # word-cloud library.
        raise ValueError('At least one word is required to build a word cloud')

    icons = ['fas fa-cloud', 'fas fa-star', 'fas fa-heart', 'fas fa-tree', 'fas fa-sun', 'fas fa-moon']
    random_icon = random.choice(icons)
    stylecloud.gen_stylecloud(text=text, icon_name=random_icon, output_name=filename)

    # Use a dedicated figure and always close it: drawing on pyplot's
    # implicit global figure would stack images and leak memory on every call.
    fig, ax = plt.subplots()
    try:
        ax.imshow(plt.imread(filename))
        ax.axis('off')
        fig.savefig(filename, bbox_inches='tight')
    finally:
        plt.close(fig)
    return filename
|
73 |
+
|
74 |
+
# PDF document layout
class PDF(FPDF):
    """FPDF subclass that adds coloured side bands and a page-number footer."""

    def header(self):
        """Fill a 10-wide, 297-high band at x=0 and at x=200."""
        band_width, band_height = 10, 297
        self.set_fill_color(200, 220, 255)
        for band_x in (0, 200):
            self.rect(band_x, 0, band_width, band_height, 'F')

    def footer(self):
        """Write the centred page number in 8-point italic Arial."""
        page_label = f'Page {self.page_no()}'
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, page_label, 0, 0, 'C')
|
85 |
+
|
86 |
+
def _safe_filename_part(title):
    """Replace path separators and other file-name-hostile characters with '_'."""
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title)


def _latin1_safe(text):
    """Replace characters outside Latin-1 with '?' so core PDF fonts can encode them."""
    return text.encode('latin-1', 'replace').decode('latin-1')


def _highlight_word(sentence, word):
    """Wrap every whole-word, case-insensitive occurrence of *word* in ``**``."""
    pattern = re.compile(rf'(?<!\w){re.escape(word)}(?!\w)', re.IGNORECASE)
    # A callable replacement keeps *word* from being read as a regex template.
    return pattern.sub(lambda _match: f'**{word}**', sentence)


def add_text_to_pdf(pdf, text, title):
    """Add a word-cloud page and an example-sentence table for *text* to *pdf*.

    Side effects: writes ``word_freq_<title>.csv`` (word frequencies) and
    ``wordcloud_<title>.png`` (the cloud image) to the working directory,
    with characters that are unsafe in file names replaced by ``_``.

    Args:
        pdf: FPDF-style document object the pages are appended to.
        text: Source text to analyse.
        title: Heading of the first page; also used in the file names.
    """
    filtered_words = preprocess_text(text)
    word_freq = Counter(filtered_words)
    # The title is user input; keep it from escaping the working directory.
    file_stem = _safe_filename_part(title)

    # newline='' is what the csv module asks for; explicit UTF-8 keeps
    # accented words independent of the platform's default encoding.
    with open(f"word_freq_{file_stem}.csv", 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['word', 'frequency'])
        writer.writerows(word_freq.items())

    cloud_filename = f'wordcloud_{file_stem}.png'
    generate_random_style_cloud(filtered_words, cloud_filename)

    # Page 1: title, horizontal rule and the word-cloud image.
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, _latin1_safe(title), ln=True, align='C')
    pdf.set_draw_color(0, 0, 0)
    pdf.set_line_width(0.5)
    pdf.line(10, 25, 200, 25)
    pdf.image(cloud_filename, x=15, y=30, w=180)

    # Page 2: table of the most frequent words with one example sentence each.
    pdf.add_page()
    pdf.set_font('Arial', 'B', 16)
    pdf.cell(0, 10, "Oraciones de ejemplo", ln=True, align='C')

    high_freq_words = sorted(word.upper() for word, _ in word_freq.most_common(20))

    pdf.set_font('Arial', 'B', 12)
    pdf.set_fill_color(200, 200, 200)
    pdf.cell(90, 10, 'PALABRA', 1, fill=True)
    pdf.cell(0, 10, 'ORACIÓN DE EJEMPLO', 1, fill=True)
    pdf.ln()

    pdf.set_font('Arial', '', 12)
    pdf.set_line_width(0.1)
    for word in high_freq_words:
        examples = get_example_sentences(word.lower())
        if not examples:
            # Words without a corpus example get no table row.
            continue
        # Single-pass highlight: chaining two str.replace calls would wrap
        # the already-inserted upper-case word a second time.
        example_sentence = _latin1_safe(_highlight_word(examples[0], word))
        pdf.cell(90, 10, _latin1_safe(word), 1)
        pdf.set_font('Arial', '', 10)
        pdf.multi_cell(0, 10, example_sentence, 1)
        pdf.set_font('Arial', 'I', 8)
        pdf.cell(90, 10, '', 0)
        pdf.cell(0, 10, 'Fuente: NLTK', 0)
        pdf.set_font('Arial', '', 12)
        pdf.ln()
|
136 |
+
|
137 |
+
def generar_pdf(text, title):
    """Build the PDF report for *text* and return the path it was saved to.

    Creates a fresh ``PDF`` document, lets ``add_text_to_pdf`` fill it, and
    writes it to ``documento.pdf`` in the working directory.
    """
    document = PDF()
    add_text_to_pdf(document, text, title)
    document.output('documento.pdf')
    return 'documento.pdf'
|
143 |
+
|
144 |
+
# Build the Gradio interface
interface = gr.Interface(
    fn=generar_pdf,
    # The gr.inputs / gr.outputs namespaces were deprecated in Gradio 3 and
    # removed in Gradio 4; the top-level component classes replace them.
    inputs=[
        gr.Textbox(lines=10, placeholder="Introduce texto en español aquí..."),
        gr.Textbox(lines=1, placeholder="Introduce un título aquí..."),
    ],
    outputs=gr.File(label="Documento PDF generado"),
    title="Generador de Nubes de Palabras y Glosario en PDF",
    description="Introduce un texto en español y un título para generar un documento PDF con nubes de palabras y un glosario con oraciones de ejemplo.",
)

if __name__ == "__main__":
    interface.launch()
|