Commit 
							
							·
						
						b49c7c6
	
1
								Parent(s):
							
							7adca4e
								
Allègements
Browse files
- requirements.txt +0 -10
- tabs/data_viz_tab.py +0 -3
- tabs/exploration_tab.py +1 -2
- tabs/id_lang_tab.py +0 -3
- tabs/modelisation_dict_tab.py +0 -1
- tabs/modelisation_seq2seq_tab.py +26 -16
    	
        requirements.txt
    CHANGED
    
    | @@ -6,16 +6,13 @@ numpy==1.23.5 | |
| 6 | 
             
            seaborn==0.13.2
         | 
| 7 | 
             
            nltk==3.8.1
         | 
| 8 | 
             
            scikit-learn==1.1.3
         | 
| 9 | 
            -
            scikit-learn-extra==0.3.0
         | 
| 10 | 
             
            gensim==4.3.2
         | 
| 11 | 
             
            sacrebleu==2.4.0
         | 
| 12 | 
            -
            pyspellchecker==0.8.1
         | 
| 13 | 
             
            spacy==3.6.0
         | 
| 14 | 
             
            https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz 
         | 
| 15 | 
             
            https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.6.0/fr_core_news_sm-3.6.0.tar.gz
         | 
| 16 | 
             
            pillow==9.5.0
         | 
| 17 | 
             
            wordcloud==1.9.3
         | 
| 18 | 
            -
            pathlib==1.0.1
         | 
| 19 | 
             
            networkx==2.7.0
         | 
| 20 | 
             
            transformers==4.37.2
         | 
| 21 | 
             
            keras-nlp==0.6.1
         | 
| @@ -23,13 +20,9 @@ keras==2.12.0 | |
| 23 | 
             
            tensorflow==2.12.0
         | 
| 24 | 
             
            sentencepiece==0.1.99
         | 
| 25 | 
             
            openai-whisper==20231117
         | 
| 26 | 
            -
            sounddevice==0.4.6
         | 
| 27 | 
             
            torch==2.2.0
         | 
| 28 | 
            -
            xformers==0.0.24
         | 
| 29 | 
            -
            translate==3.6.1
         | 
| 30 | 
             
            speechrecognition==3.10.1
         | 
| 31 | 
             
            audio_recorder_streamlit==0.0.8
         | 
| 32 | 
            -
            wave==0.0.2
         | 
| 33 | 
             
            whisper==1.1.10
         | 
| 34 | 
             
            wavio==0.0.8
         | 
| 35 | 
             
            filesplit==4.0.1
         | 
| @@ -39,7 +32,4 @@ graphviz==0.20.1 | |
| 39 | 
             
            gTTS==2.5.1
         | 
| 40 | 
             
            https://files.pythonhosted.org/packages/cc/58/96aff0e5cb8b59c06232ea7e249ed902d04ec89f52636f5be06ceb0855fe/extra_streamlit_components-0.1.60-py3-none-any.whl
         | 
| 41 | 
             
            streamlit-option-menu==0.3.12
         | 
| 42 | 
            -
            plotly==5.18.0
         | 
| 43 | 
            -
            bokeh==3.3.4
         | 
| 44 | 
            -
            shap==0.44.1
         | 
| 45 | 
             
            deep-translator==1.11.4
         | 
|  | |
| 6 | 
             
            seaborn==0.13.2
         | 
| 7 | 
             
            nltk==3.8.1
         | 
| 8 | 
             
            scikit-learn==1.1.3
         | 
|  | |
| 9 | 
             
            gensim==4.3.2
         | 
| 10 | 
             
            sacrebleu==2.4.0
         | 
|  | |
| 11 | 
             
            spacy==3.6.0
         | 
| 12 | 
             
            https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz 
         | 
| 13 | 
             
            https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.6.0/fr_core_news_sm-3.6.0.tar.gz
         | 
| 14 | 
             
            pillow==9.5.0
         | 
| 15 | 
             
            wordcloud==1.9.3
         | 
|  | |
| 16 | 
             
            networkx==2.7.0
         | 
| 17 | 
             
            transformers==4.37.2
         | 
| 18 | 
             
            keras-nlp==0.6.1
         | 
|  | |
| 20 | 
             
            tensorflow==2.12.0
         | 
| 21 | 
             
            sentencepiece==0.1.99
         | 
| 22 | 
             
            openai-whisper==20231117
         | 
|  | |
| 23 | 
             
            torch==2.2.0
         | 
|  | |
|  | |
| 24 | 
             
            speechrecognition==3.10.1
         | 
| 25 | 
             
            audio_recorder_streamlit==0.0.8
         | 
|  | |
| 26 | 
             
            whisper==1.1.10
         | 
| 27 | 
             
            wavio==0.0.8
         | 
| 28 | 
             
            filesplit==4.0.1
         | 
|  | |
| 32 | 
             
            gTTS==2.5.1
         | 
| 33 | 
             
            https://files.pythonhosted.org/packages/cc/58/96aff0e5cb8b59c06232ea7e249ed902d04ec89f52636f5be06ceb0855fe/extra_streamlit_components-0.1.60-py3-none-any.whl
         | 
| 34 | 
             
            streamlit-option-menu==0.3.12
         | 
|  | |
|  | |
|  | |
| 35 | 
             
            deep-translator==1.11.4
         | 
    	
        tabs/data_viz_tab.py
    CHANGED
    
    | @@ -7,9 +7,6 @@ import numpy as np | |
| 7 | 
             
            import pandas as pd
         | 
| 8 | 
             
            import matplotlib.pyplot as plt
         | 
| 9 | 
             
            import seaborn as sns
         | 
| 10 | 
            -
            import plotly.express as px
         | 
| 11 | 
            -
            import plotly.graph_objects as go
         | 
| 12 | 
            -
            import plotly.figure_factory as ff
         | 
| 13 | 
             
            from wordcloud import WordCloud
         | 
| 14 | 
             
            import nltk
         | 
| 15 | 
             
            from nltk.corpus import stopwords
         | 
|  | |
| 7 | 
             
            import pandas as pd
         | 
| 8 | 
             
            import matplotlib.pyplot as plt
         | 
| 9 | 
             
            import seaborn as sns
         | 
|  | |
|  | |
|  | |
| 10 | 
             
            from wordcloud import WordCloud
         | 
| 11 | 
             
            import nltk
         | 
| 12 | 
             
            from nltk.corpus import stopwords
         | 
    	
        tabs/exploration_tab.py
    CHANGED
    
    | @@ -1,6 +1,5 @@ | |
| 1 | 
             
            import streamlit as st
         | 
| 2 | 
             
            import os
         | 
| 3 | 
            -
            import numpy as np
         | 
| 4 | 
             
            import pandas as pd
         | 
| 5 | 
             
            import collections
         | 
| 6 | 
             
            from nltk.tokenize import word_tokenize
         | 
| @@ -8,7 +7,7 @@ from nltk import download | |
| 8 | 
             
            from ast import literal_eval
         | 
| 9 | 
             
            from translate_app import tr
         | 
| 10 | 
             
            if st.session_state.Cloud == 0:
         | 
| 11 | 
            -
                import nltk
         | 
| 12 | 
             
                import contextlib
         | 
| 13 | 
             
                import re
         | 
| 14 | 
             
                from nltk.corpus import stopwords
         | 
|  | |
| 1 | 
             
            import streamlit as st
         | 
| 2 | 
             
            import os
         | 
|  | |
| 3 | 
             
            import pandas as pd
         | 
| 4 | 
             
            import collections
         | 
| 5 | 
             
            from nltk.tokenize import word_tokenize
         | 
|  | |
| 7 | 
             
            from ast import literal_eval
         | 
| 8 | 
             
            from translate_app import tr
         | 
| 9 | 
             
            if st.session_state.Cloud == 0:
         | 
| 10 | 
            +
                # import nltk
         | 
| 11 | 
             
                import contextlib
         | 
| 12 | 
             
                import re
         | 
| 13 | 
             
                from nltk.corpus import stopwords
         | 
    	
        tabs/id_lang_tab.py
    CHANGED
    
    | @@ -1,10 +1,8 @@ | |
| 1 | 
             
            import streamlit as st
         | 
| 2 | 
             
            import pandas as pd
         | 
| 3 | 
             
            import numpy as np
         | 
| 4 | 
            -
            import os
         | 
| 5 | 
             
            import matplotlib.pyplot as plt
         | 
| 6 | 
             
            import tiktoken
         | 
| 7 | 
            -
            import random
         | 
| 8 | 
             
            import joblib
         | 
| 9 | 
             
            import json
         | 
| 10 | 
             
            import csv
         | 
| @@ -12,7 +10,6 @@ from transformers import pipeline | |
| 12 | 
             
            import keras
         | 
| 13 | 
             
            from tensorflow.keras.preprocessing.sequence import pad_sequences
         | 
| 14 | 
             
            from sklearn.preprocessing import LabelEncoder
         | 
| 15 | 
            -
            from sklearn.feature_extraction.text import CountVectorizer
         | 
| 16 | 
             
            from tensorflow.keras.utils import plot_model
         | 
| 17 | 
             
            from filesplit.merge import Merge
         | 
| 18 | 
             
            from extra_streamlit_components import tab_bar, TabBarItemData
         | 
|  | |
| 1 | 
             
            import streamlit as st
         | 
| 2 | 
             
            import pandas as pd
         | 
| 3 | 
             
            import numpy as np
         | 
|  | |
| 4 | 
             
            import matplotlib.pyplot as plt
         | 
| 5 | 
             
            import tiktoken
         | 
|  | |
| 6 | 
             
            import joblib
         | 
| 7 | 
             
            import json
         | 
| 8 | 
             
            import csv
         | 
|  | |
| 10 | 
             
            import keras
         | 
| 11 | 
             
            from tensorflow.keras.preprocessing.sequence import pad_sequences
         | 
| 12 | 
             
            from sklearn.preprocessing import LabelEncoder
         | 
|  | |
| 13 | 
             
            from tensorflow.keras.utils import plot_model
         | 
| 14 | 
             
            from filesplit.merge import Merge
         | 
| 15 | 
             
            from extra_streamlit_components import tab_bar, TabBarItemData
         | 
    	
        tabs/modelisation_dict_tab.py
    CHANGED
    
    | @@ -1,6 +1,5 @@ | |
| 1 | 
             
            import streamlit as st
         | 
| 2 | 
             
            import pandas as pd
         | 
| 3 | 
            -
            import numpy as np
         | 
| 4 | 
             
            import os
         | 
| 5 | 
             
            from sacrebleu import corpus_bleu
         | 
| 6 | 
             
            if st.session_state.Cloud == 0:
         | 
|  | |
| 1 | 
             
            import streamlit as st
         | 
| 2 | 
             
            import pandas as pd
         | 
|  | |
| 3 | 
             
            import os
         | 
| 4 | 
             
            from sacrebleu import corpus_bleu
         | 
| 5 | 
             
            if st.session_state.Cloud == 0:
         | 
    	
        tabs/modelisation_seq2seq_tab.py
    CHANGED
    
    | @@ -4,12 +4,12 @@ import numpy as np | |
| 4 | 
             
            import os
         | 
| 5 | 
             
            from sacrebleu import corpus_bleu
         | 
| 6 | 
             
            from transformers import pipeline
         | 
| 7 | 
            -
            from translate import Translator
         | 
|  | |
| 8 | 
             
            from audio_recorder_streamlit import audio_recorder
         | 
| 9 | 
             
            import speech_recognition as sr
         | 
| 10 | 
             
            import whisper
         | 
| 11 | 
             
            import io
         | 
| 12 | 
            -
            # import wave
         | 
| 13 | 
             
            import wavio
         | 
| 14 | 
             
            from filesplit.merge import Merge
         | 
| 15 | 
             
            import tensorflow as tf
         | 
| @@ -19,7 +19,7 @@ from tensorflow import keras | |
| 19 | 
             
            from keras_nlp.layers import TransformerEncoder
         | 
| 20 | 
             
            from tensorflow.keras import layers
         | 
| 21 | 
             
            from tensorflow.keras.utils import plot_model
         | 
| 22 | 
            -
            from PIL import Image
         | 
| 23 | 
             
            from gtts import gTTS
         | 
| 24 | 
             
            from extra_streamlit_components import tab_bar, TabBarItemData
         | 
| 25 | 
             
            from translate_app import tr
         | 
| @@ -463,7 +463,8 @@ def run(): | |
| 463 | 
             
                    with col2:
         | 
| 464 | 
             
                        st.write(":red[**Trad. Google Translate**]")
         | 
| 465 | 
             
                        try:
         | 
| 466 | 
            -
                            translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
         | 
|  | |
| 467 | 
             
                            if custom_sentence!="":
         | 
| 468 | 
             
                                translation = translator.translate(custom_sentence)
         | 
| 469 | 
             
                                st.write("**"+l_tgt+" :**  "+translation)
         | 
| @@ -488,31 +489,39 @@ def run(): | |
| 488 | 
             
                        st.write("## **"+tr("Résultats")+" :**\n")
         | 
| 489 | 
             
                        st.audio(audio_bytes, format="audio/wav")
         | 
| 490 | 
             
                        try:
         | 
| 491 | 
            -
                             | 
| 492 | 
            -
             | 
| 493 | 
            -
                                audio_stream_bytesio = io.BytesIO(audio_bytes)
         | 
| 494 |  | 
| 495 | 
            -
             | 
| 496 | 
            -
             | 
| 497 |  | 
| 498 | 
            -
             | 
| 499 | 
            -
             | 
| 500 |  | 
| 501 | 
            -
             | 
| 502 | 
            -
             | 
| 503 | 
            -
             | 
| 504 | 
            -
             | 
|  | |
| 505 | 
             
                                result = model_speech.transcribe(audio_input)
         | 
| 506 | 
             
                                st.write(tr("Langue détectée")+" : "+result["language"])
         | 
| 507 | 
             
                                Lang_detected = result["language"]
         | 
| 508 | 
             
                                # Transcription Whisper (si result a été préalablement calculé)
         | 
| 509 | 
             
                                custom_sentence = result["text"]
         | 
| 510 | 
             
                            else:
         | 
|  | |
| 511 | 
             
                                Lang_detected = l_src
         | 
| 512 | 
             
                                # Transcription google
         | 
| 513 | 
             
                                audio_stream = sr.AudioData(audio_bytes, 32000, 2) 
         | 
| 514 | 
             
                                r = sr.Recognizer()
         | 
| 515 | 
             
                                custom_sentence = r.recognize_google(audio_stream, language = Lang_detected)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 516 |  | 
| 517 | 
             
                            if custom_sentence!="":
         | 
| 518 | 
             
                                # Lang_detected = lang_classifier (custom_sentence)[0]['label']
         | 
| @@ -520,7 +529,8 @@ def run(): | |
| 520 | 
             
                                st.write("")
         | 
| 521 | 
             
                                st.write("**"+Lang_detected+" :**  :blue["+custom_sentence+"]")
         | 
| 522 | 
             
                                st.write("")
         | 
| 523 | 
            -
                                translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
         | 
|  | |
| 524 | 
             
                                translation = translator.translate(custom_sentence)
         | 
| 525 | 
             
                                st.write("**"+l_tgt+" :**  "+translation)
         | 
| 526 | 
             
                                st.write("")
         | 
|  | |
| 4 | 
             
            import os
         | 
| 5 | 
             
            from sacrebleu import corpus_bleu
         | 
| 6 | 
             
            from transformers import pipeline
         | 
| 7 | 
            +
            # from translate import Translator
         | 
| 8 | 
            +
            from deep_translator import GoogleTranslator
         | 
| 9 | 
             
            from audio_recorder_streamlit import audio_recorder
         | 
| 10 | 
             
            import speech_recognition as sr
         | 
| 11 | 
             
            import whisper
         | 
| 12 | 
             
            import io
         | 
|  | |
| 13 | 
             
            import wavio
         | 
| 14 | 
             
            from filesplit.merge import Merge
         | 
| 15 | 
             
            import tensorflow as tf
         | 
|  | |
| 19 | 
             
            from keras_nlp.layers import TransformerEncoder
         | 
| 20 | 
             
            from tensorflow.keras import layers
         | 
| 21 | 
             
            from tensorflow.keras.utils import plot_model
         | 
| 22 | 
            +
            # from PIL import Image
         | 
| 23 | 
             
            from gtts import gTTS
         | 
| 24 | 
             
            from extra_streamlit_components import tab_bar, TabBarItemData
         | 
| 25 | 
             
            from translate_app import tr
         | 
|  | |
| 463 | 
             
                    with col2:
         | 
| 464 | 
             
                        st.write(":red[**Trad. Google Translate**]")
         | 
| 465 | 
             
                        try:
         | 
| 466 | 
            +
                            # translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
         | 
| 467 | 
            +
                            translator = GoogleTranslator(source=Lang_detected, target=l_tgt)
         | 
| 468 | 
             
                            if custom_sentence!="":
         | 
| 469 | 
             
                                translation = translator.translate(custom_sentence)
         | 
| 470 | 
             
                                st.write("**"+l_tgt+" :**  "+translation)
         | 
|  | |
| 489 | 
             
                        st.write("## **"+tr("Résultats")+" :**\n")
         | 
| 490 | 
             
                        st.audio(audio_bytes, format="audio/wav")
         | 
| 491 | 
             
                        try:
         | 
| 492 | 
            +
                            # Create a BytesIO object from the audio stream
         | 
| 493 | 
            +
                            audio_stream_bytesio = io.BytesIO(audio_bytes)
         | 
|  | |
| 494 |  | 
| 495 | 
            +
                            # Read the WAV stream using wavio
         | 
| 496 | 
            +
                            wav = wavio.read(audio_stream_bytesio) 
         | 
| 497 |  | 
| 498 | 
            +
                            # Extract the audio data from the wavio.Wav object
         | 
| 499 | 
            +
                            audio_data = wav.data
         | 
| 500 |  | 
| 501 | 
            +
                            # Convert the audio data to a NumPy array
         | 
| 502 | 
            +
                            audio_input = np.array(audio_data, dtype=np.float32)
         | 
| 503 | 
            +
                            audio_input = np.mean(audio_input, axis=1)/32768
         | 
| 504 | 
            +
                            
         | 
| 505 | 
            +
                            if detection:            
         | 
| 506 | 
             
                                result = model_speech.transcribe(audio_input)
         | 
| 507 | 
             
                                st.write(tr("Langue détectée")+" : "+result["language"])
         | 
| 508 | 
             
                                Lang_detected = result["language"]
         | 
| 509 | 
             
                                # Transcription Whisper (si result a été préalablement calculé)
         | 
| 510 | 
             
                                custom_sentence = result["text"]
         | 
| 511 | 
             
                            else:
         | 
| 512 | 
            +
                                # Avec l'aide de la bibliothèque speech_recognition de Google
         | 
| 513 | 
             
                                Lang_detected = l_src
         | 
| 514 | 
             
                                # Transcription google
         | 
| 515 | 
             
                                audio_stream = sr.AudioData(audio_bytes, 32000, 2) 
         | 
| 516 | 
             
                                r = sr.Recognizer()
         | 
| 517 | 
             
                                custom_sentence = r.recognize_google(audio_stream, language = Lang_detected)
         | 
| 518 | 
            +
                                
         | 
| 519 | 
            +
                                # Sans la bibliothèque speech_recognition, uniquement avec Whisper
         | 
| 520 | 
            +
                                '''
         | 
| 521 | 
            +
                                Lang_detected = l_src
         | 
| 522 | 
            +
                                result = model_speech.transcribe(audio_input, language=Lang_detected)
         | 
| 523 | 
            +
                                custom_sentence = result["text"]
         | 
| 524 | 
            +
                                '''
         | 
| 525 |  | 
| 526 | 
             
                            if custom_sentence!="":
         | 
| 527 | 
             
                                # Lang_detected = lang_classifier (custom_sentence)[0]['label']
         | 
|  | |
| 529 | 
             
                                st.write("")
         | 
| 530 | 
             
                                st.write("**"+Lang_detected+" :**  :blue["+custom_sentence+"]")
         | 
| 531 | 
             
                                st.write("")
         | 
| 532 | 
            +
                                # translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
         | 
| 533 | 
            +
                                translator = GoogleTranslator(source=Lang_detected, target=l_tgt)
         | 
| 534 | 
             
                                translation = translator.translate(custom_sentence)
         | 
| 535 | 
             
                                st.write("**"+l_tgt+" :**  "+translation)
         | 
| 536 | 
             
                                st.write("")
         |