dexttttrees committed on
Commit
99559eb
·
verified ·
1 Parent(s): f6d9a68
Files changed (1) hide show
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import nltk
3
+ from nltk.corpus import stopwords, cess_esp, conll2002
4
+ from nltk.tokenize import word_tokenize
5
+ import stylecloud
6
+ import matplotlib.pyplot as plt
7
+ from fpdf import FPDF
8
+ import re
9
+ from collections import Counter
10
+ import spacy
11
+ import os
12
+ import random
13
+ import string
14
+ import csv
15
+
16
+ # Descargar recursos necesarios de nltk
17
+ nltk.download('punkt')
18
+ nltk.download('stopwords')
19
+ nltk.download('cess_esp')
20
+ nltk.download('conll2002')
21
+
22
+ # Cargar el modelo de spaCy para español
23
+ nlp = spacy.load('es_core_news_md')
24
+
25
+ # Lista de stopwords adicional
26
+ additional_stopwords = [
27
+ # Aquí va tu lista de stopwords adicional...
28
+ ]
29
+
30
+ # Función de preprocesamiento
31
+ def preprocess_text(text):
32
+ text = text.lower()
33
+ text = re.sub(r'\W', ' ', text)
34
+ doc = nlp(text)
35
+ stop_words = set(stopwords.words('spanish')).union(set(additional_stopwords))
36
+ filtered_words = []
37
+ for token in doc:
38
+ if token.text not in stop_words:
39
+ if token.pos_ in ['VERB', 'ADJ', 'NOUN']: # Considerar verbos, adjetivos y sustantivos
40
+ filtered_words.append(token.lemma_)
41
+ elif token.pos_ == 'NUM': # Ignorar números
42
+ continue
43
+ else:
44
+ # Ignorar adverbios y otras categorías gramaticales
45
+ if token.pos_ not in ['ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'PRON', 'SCONJ', 'SYM', 'X']:
46
+ filtered_words.append(token.text)
47
+ return filtered_words
48
+
49
+ # Obtener oraciones de ejemplo de múltiples corpus de nltk
50
+ def get_example_sentences(word, num_sentences=1):
51
+ sentences = []
52
+ for corpus in [cess_esp, conll2002]:
53
+ for sent in corpus.sents():
54
+ if word in sent and len(word) > 1:
55
+ sentences.append(' '.join(sent))
56
+ if len(sentences) >= num_sentences:
57
+ break
58
+ if len(sentences) >= num_sentences:
59
+ break
60
+ return sentences
61
+
62
+ # Función para generar la nube de palabras con estilo aleatorio
63
+ def generate_random_style_cloud(words, filename):
64
+ text = ' '.join(words)
65
+ icons = ['fas fa-cloud', 'fas fa-star', 'fas fa-heart', 'fas fa-tree', 'fas fa-sun', 'fas fa-moon']
66
+ random_icon = random.choice(icons)
67
+ stylecloud.gen_stylecloud(text=text, icon_name=random_icon, output_name=filename)
68
+ img = plt.imread(filename)
69
+ plt.imshow(img)
70
+ plt.axis('off')
71
+ plt.savefig(filename, bbox_inches='tight')
72
+ return filename
73
+
74
+ # Crear el documento PDF
75
+ class PDF(FPDF):
76
+ def header(self):
77
+ self.set_fill_color(200, 220, 255)
78
+ self.rect(0, 0, 10, 297, 'F')
79
+ self.rect(200, 0, 10, 297, 'F')
80
+
81
+ def footer(self):
82
+ self.set_y(-15)
83
+ self.set_font('Arial', 'I', 8)
84
+ self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
85
+
86
+ def add_text_to_pdf(pdf, text, title):
87
+ filtered_words = preprocess_text(text)
88
+ word_freq = Counter(filtered_words)
89
+ word_freq_file = f"word_freq_{title}.csv"
90
+
91
+ with open(word_freq_file, 'w') as f:
92
+ writer = csv.writer(f)
93
+ writer.writerow(['word', 'frequency'])
94
+ for word, freq in word_freq.items():
95
+ writer.writerow([word, freq])
96
+
97
+ cloud_filename = f'wordcloud_{title}.png'
98
+ generate_random_style_cloud(filtered_words, cloud_filename)
99
+
100
+ pdf.add_page()
101
+ pdf.set_font('Arial', 'B', 16)
102
+ pdf.cell(0, 10, title, ln=True, align='C')
103
+ pdf.set_draw_color(0, 0, 0)
104
+ pdf.set_line_width(0.5)
105
+ pdf.line(10, 25, 200, 25)
106
+ pdf.image(cloud_filename, x=15, y=30, w=180)
107
+
108
+ pdf.add_page()
109
+ pdf.set_font('Arial', 'B', 16)
110
+ pdf.cell(0, 10, "Oraciones de ejemplo", ln=True, align='C')
111
+
112
+ high_freq_words = sorted([word.upper() for word, freq in word_freq.most_common(20)])
113
+
114
+ pdf.set_font('Arial', 'B', 12)
115
+ pdf.set_fill_color(200, 200, 200)
116
+ pdf.cell(90, 10, 'PALABRA', 1, fill=True)
117
+ pdf.cell(0, 10, 'ORACIÓN DE EJEMPLO', 1, fill=True)
118
+ pdf.ln()
119
+
120
+ pdf.set_font('Arial', '', 12)
121
+ pdf.set_line_width(0.1)
122
+ for word in high_freq_words:
123
+ example_sent = get_example_sentences(word.lower())
124
+ if example_sent:
125
+ example_sentence = example_sent[0].replace(word.lower(), f'**{word}**').replace(word, f'**{word}**')
126
+ pdf.cell(90, 10, word, 1)
127
+ pdf.set_font('Arial', '', 10)
128
+ pdf.multi_cell(0, 10, example_sentence, 1)
129
+ pdf.set_font('Arial', 'I', 8)
130
+ pdf.cell(90, 10, '', 0)
131
+ pdf.cell(0, 10, 'Fuente: NLTK', 0)
132
+ pdf.set_font('Arial', '', 12)
133
+ else:
134
+ continue
135
+ pdf.ln()
136
+
137
+ def generar_pdf(text, title):
138
+ pdf = PDF()
139
+ add_text_to_pdf(pdf, text, title)
140
+ output_filename = 'documento.pdf'
141
+ pdf.output(output_filename)
142
+ return output_filename
143
+
144
+ # Crear la interfaz de Gradio
145
+ interface = gr.Interface(
146
+ fn=generar_pdf,
147
+ inputs=[gr.inputs.Textbox(lines=10, placeholder="Introduce texto en español aquí..."), gr.inputs.Textbox(lines=1, placeholder="Introduce un título aquí...")],
148
+ outputs=gr.outputs.File(label="Documento PDF generado"),
149
+ title="Generador de Nubes de Palabras y Glosario en PDF",
150
+ description="Introduce un texto en español y un título para generar un documento PDF con nubes de palabras y un glosario con oraciones de ejemplo."
151
+ )
152
+
153
+ if __name__ == "__main__":
154
+ interface.launch()