|  |  | 
					
						
						|  | import spacy | 
					
						
						|  | from collections import Counter | 
					
						
						|  | from spacy import displacy | 
					
						
						|  | import re | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | POS_COLORS = { | 
					
						
						|  | 'ADJ': '#FFA07A', | 
					
						
						|  | 'ADP': '#98FB98', | 
					
						
						|  | 'ADV': '#87CEFA', | 
					
						
						|  | 'AUX': '#DDA0DD', | 
					
						
						|  | 'CCONJ': '#F0E68C', | 
					
						
						|  | 'DET': '#FFB6C1', | 
					
						
						|  | 'INTJ': '#FF6347', | 
					
						
						|  | 'NOUN': '#90EE90', | 
					
						
						|  | 'NUM': '#FAFAD2', | 
					
						
						|  | 'PART': '#D3D3D3', | 
					
						
						|  | 'PRON': '#FFA500', | 
					
						
						|  | 'PROPN': '#20B2AA', | 
					
						
						|  | 'SCONJ': '#DEB887', | 
					
						
						|  | 'SYM': '#7B68EE', | 
					
						
						|  | 'VERB': '#FF69B4', | 
					
						
						|  | 'X': '#A9A9A9', | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  | POS_TRANSLATIONS = { | 
					
						
						|  | 'es': { | 
					
						
						|  | 'ADJ': 'Adjetivo', | 
					
						
						|  | 'ADP': 'Preposición', | 
					
						
						|  | 'ADV': 'Adverbio', | 
					
						
						|  | 'AUX': 'Auxiliar', | 
					
						
						|  | 'CCONJ': 'Conjunción Coordinante', | 
					
						
						|  | 'DET': 'Determinante', | 
					
						
						|  | 'INTJ': 'Interjección', | 
					
						
						|  | 'NOUN': 'Sustantivo', | 
					
						
						|  | 'NUM': 'Número', | 
					
						
						|  | 'PART': 'Partícula', | 
					
						
						|  | 'PRON': 'Pronombre', | 
					
						
						|  | 'PROPN': 'Nombre Propio', | 
					
						
						|  | 'SCONJ': 'Conjunción Subordinante', | 
					
						
						|  | 'SYM': 'Símbolo', | 
					
						
						|  | 'VERB': 'Verbo', | 
					
						
						|  | 'X': 'Otro', | 
					
						
						|  | }, | 
					
						
						|  | 'en': { | 
					
						
						|  | 'ADJ': 'Adjective', | 
					
						
						|  | 'ADP': 'Preposition', | 
					
						
						|  | 'ADV': 'Adverb', | 
					
						
						|  | 'AUX': 'Auxiliary', | 
					
						
						|  | 'CCONJ': 'Coordinating Conjunction', | 
					
						
						|  | 'DET': 'Determiner', | 
					
						
						|  | 'INTJ': 'Interjection', | 
					
						
						|  | 'NOUN': 'Noun', | 
					
						
						|  | 'NUM': 'Number', | 
					
						
						|  | 'PART': 'Particle', | 
					
						
						|  | 'PRON': 'Pronoun', | 
					
						
						|  | 'PROPN': 'Proper Noun', | 
					
						
						|  | 'SCONJ': 'Subordinating Conjunction', | 
					
						
						|  | 'SYM': 'Symbol', | 
					
						
						|  | 'VERB': 'Verb', | 
					
						
						|  | 'X': 'Other', | 
					
						
						|  | }, | 
					
						
						|  | 'fr': { | 
					
						
						|  | 'ADJ': 'Adjectif', | 
					
						
						|  | 'ADP': 'Préposition', | 
					
						
						|  | 'ADV': 'Adverbe', | 
					
						
						|  | 'AUX': 'Auxiliaire', | 
					
						
						|  | 'CCONJ': 'Conjonction de Coordination', | 
					
						
						|  | 'DET': 'Déterminant', | 
					
						
						|  | 'INTJ': 'Interjection', | 
					
						
						|  | 'NOUN': 'Nom', | 
					
						
						|  | 'NUM': 'Nombre', | 
					
						
						|  | 'PART': 'Particule', | 
					
						
						|  | 'PRON': 'Pronom', | 
					
						
						|  | 'PROPN': 'Nom Propre', | 
					
						
						|  | 'SCONJ': 'Conjonction de Subordination', | 
					
						
						|  | 'SYM': 'Symbole', | 
					
						
						|  | 'VERB': 'Verbe', | 
					
						
						|  | 'X': 'Autre', | 
					
						
						|  | } | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_repeated_words_colors(doc): | 
					
						
						|  | word_counts = Counter(token.text.lower() for token in doc if token.pos_ != 'PUNCT') | 
					
						
						|  | repeated_words = {word: count for word, count in word_counts.items() if count > 1} | 
					
						
						|  |  | 
					
						
						|  | word_colors = {} | 
					
						
						|  | for token in doc: | 
					
						
						|  | if token.text.lower() in repeated_words: | 
					
						
						|  | word_colors[token.text.lower()] = POS_COLORS.get(token.pos_, '#FFFFFF') | 
					
						
						|  |  | 
					
						
						|  | return word_colors | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def highlight_repeated_words(doc, word_colors): | 
					
						
						|  | highlighted_text = [] | 
					
						
						|  | for token in doc: | 
					
						
						|  | if token.text.lower() in word_colors: | 
					
						
						|  | color = word_colors[token.text.lower()] | 
					
						
						|  | highlighted_text.append(f'<span style="background-color: {color};">{token.text}</span>') | 
					
						
						|  | else: | 
					
						
						|  | highlighted_text.append(token.text) | 
					
						
						|  | return ' '.join(highlighted_text) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def generate_arc_diagram(doc, lang_code): | 
					
						
						|  | sentences = list(doc.sents) | 
					
						
						|  | arc_diagrams = [] | 
					
						
						|  | for sent in sentences: | 
					
						
						|  | html = displacy.render(sent, style="dep", options={"distance": 100}) | 
					
						
						|  | html = html.replace('height="375"', 'height="200"') | 
					
						
						|  | html = re.sub(r'<svg[^>]*>', lambda m: m.group(0).replace('height="450"', 'height="300"'), html) | 
					
						
						|  | html = re.sub(r'<g [^>]*transform="translate\((\d+),(\d+)\)"', lambda m: f'<g transform="translate({m.group(1)},50)"', html) | 
					
						
						|  | arc_diagrams.append(html) | 
					
						
						|  | return arc_diagrams | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_detailed_pos_analysis(doc): | 
					
						
						|  | """ | 
					
						
						|  | Realiza un análisis detallado de las categorías gramaticales (POS) en el texto. | 
					
						
						|  | """ | 
					
						
						|  | pos_counts = Counter(token.pos_ for token in doc) | 
					
						
						|  | total_tokens = len(doc) | 
					
						
						|  | pos_analysis = [] | 
					
						
						|  | for pos, count in pos_counts.items(): | 
					
						
						|  | percentage = (count / total_tokens) * 100 | 
					
						
						|  | pos_analysis.append({ | 
					
						
						|  | 'pos': pos, | 
					
						
						|  | 'count': count, | 
					
						
						|  | 'percentage': round(percentage, 2), | 
					
						
						|  | 'examples': [token.text for token in doc if token.pos_ == pos][:5] | 
					
						
						|  | }) | 
					
						
						|  | return sorted(pos_analysis, key=lambda x: x['count'], reverse=True) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_morphological_analysis(doc): | 
					
						
						|  | """ | 
					
						
						|  | Realiza un análisis morfológico detallado de las palabras en el texto. | 
					
						
						|  | """ | 
					
						
						|  | morphology_analysis = [] | 
					
						
						|  | for token in doc: | 
					
						
						|  | if token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV']: | 
					
						
						|  | morphology_analysis.append({ | 
					
						
						|  | 'text': token.text, | 
					
						
						|  | 'lemma': token.lemma_, | 
					
						
						|  | 'pos': token.pos_, | 
					
						
						|  | 'tag': token.tag_, | 
					
						
						|  | 'dep': token.dep_, | 
					
						
						|  | 'shape': token.shape_, | 
					
						
						|  | 'is_alpha': token.is_alpha, | 
					
						
						|  | 'is_stop': token.is_stop, | 
					
						
						|  | 'morph': str(token.morph) | 
					
						
						|  | }) | 
					
						
						|  | return morphology_analysis | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_sentence_structure_analysis(doc): | 
					
						
						|  | """ | 
					
						
						|  | Analiza la estructura de las oraciones en el texto. | 
					
						
						|  | """ | 
					
						
						|  | sentence_analysis = [] | 
					
						
						|  | for sent in doc.sents: | 
					
						
						|  | sentence_analysis.append({ | 
					
						
						|  | 'text': sent.text, | 
					
						
						|  | 'root': sent.root.text, | 
					
						
						|  | 'root_pos': sent.root.pos_, | 
					
						
						|  | 'num_tokens': len(sent), | 
					
						
						|  | 'num_words': len([token for token in sent if token.is_alpha]), | 
					
						
						|  | 'subjects': [token.text for token in sent if "subj" in token.dep_], | 
					
						
						|  | 'objects': [token.text for token in sent if "obj" in token.dep_], | 
					
						
						|  | 'verbs': [token.text for token in sent if token.pos_ == "VERB"] | 
					
						
						|  | }) | 
					
						
						|  | return sentence_analysis | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def perform_advanced_morphosyntactic_analysis(text, nlp): | 
					
						
						|  | """ | 
					
						
						|  | Realiza un análisis morfosintáctico avanzado del texto. | 
					
						
						|  | """ | 
					
						
						|  | doc = nlp(text) | 
					
						
						|  | return { | 
					
						
						|  | 'pos_analysis': get_detailed_pos_analysis(doc), | 
					
						
						|  | 'morphological_analysis': get_morphological_analysis(doc), | 
					
						
						|  | 'sentence_structure': get_sentence_structure_analysis(doc), | 
					
						
						|  | 'arc_diagram': generate_arc_diagram(doc, nlp.lang) | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | __all__ = ['get_repeated_words_colors', 'highlight_repeated_words', 'generate_arc_diagram', 'perform_advanced_morphosyntactic_analysis', 'POS_COLORS', 'POS_TRANSLATIONS'] |