abdalrahmanshahrour committed
Commit 7f7c4b7 · Parent: 80a8eb4
Upload website
Browse files:
- app.py +83 -0
- requirements.txt +72 -0
- style.css +35 -0
- summarize.py +171 -0
- text1.txt +2 -0
- text2.txt +1 -0
app.py
ADDED
@@ -0,0 +1,83 @@
from summarize import get_results
import arabic_reshaper
import streamlit as st
import requests
from bidi.algorithm import get_display

from streamlit_lottie import st_lottie  # pip install streamlit-lottie

st.set_page_config(
    page_title="Arabic Summarization",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://www.extremelycoolapp.com/help',
        'Report a bug': "https://www.extremelycoolapp.com/bug",
        'About': "# Arabic Text Summarization, abdalrahman shahrour",
    }
)


def load_lottieurl(url: str):
    # Fetch a Lottie animation as JSON; return None on any non-200 response.
    r = requests.get(url)
    if r.status_code != 200:
        return None
    return r.json()


st.header('تلخيص النصوص العربية : ')  # "Arabic text summarization:"

# Reshape and reorder Arabic glyphs for correct right-to-left display.
rtl = lambda w: get_display(f"{arabic_reshaper.reshape(w)}")

with open('style.css') as f:
    st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)


c = load_lottieurl("https://assets7.lottiefiles.com/packages/lf20_ofa3xwo7.json")
with st.sidebar:
    if c is not None:  # skip the animation if the download failed
        st_lottie(c)
    st.markdown("")


model = st.sidebar.selectbox(
    'Select one',
    ['arabartsummarization', 'AraBART', 'auto-arabic-summarization',
     'BERT2BERT', 'xlmroberta2xlmroberta', 'nltk_summarizer'],
    help="Model",
)

st.sidebar.write("\n")
num_beams = st.sidebar.slider(
    "Number of beams", min_value=1, max_value=10, value=3, step=1
)

st.sidebar.write("\n")
length_penalty = st.sidebar.slider(
    "Length penalty", min_value=0.1, max_value=3.0, value=1.0, step=0.1,
)

st.sidebar.write("\n")
number_of_sentence = st.sidebar.slider(
    "Number of sentences", min_value=1, max_value=10, value=3, step=1
)

st.sidebar.write("\n")
height = st.sidebar.slider(
    "Height", min_value=200, max_value=1000, value=350, step=20
)

# Default example (Arabic): a news report on a UK trial of ibuprofen for
# COVID-19 patients. The label reads "Put the text to summarize here:".
doc = st.text_area("ضع هنا النص المراد تلخيصه : ", height=height, value="""يجري علماء في بريطانيا تجربة لاختبار فعالية عقار إيبوبروفين لمساعدة المصابين بفيروس كورونا. وذكرت هيئة الإذاعة البريطانية "بي بي سي" أن فريق مشترك من أطباء مستشفيات "جاي" و"سانت توماس" و"كينغز كوليدج" في لندن يعتقد أن إيبوبروفين، وهو مضاد للالتهابات ومسكن للألم، يمكن أن يعالج صعوبات التنفس.
ويأمل العلماء أن يساعد هذا العلاج المنخفض التكلفة المرضى في الاستغناء عن أجهزة التنفس الصناعي. وذكرت أنه خلال فترة الاختبار، سيحصل نصف المرضى على إيبوبروفين بالإضافة إلى الرعاية المعتادة، حيث سيتم استخدام تركيبة خاصة من إيبوبروفين بدلا من الأقراص العادية التي قد يشتريها الناس عادة.""")

summarize_button = st.button(label="لخص النص")  # "Summarize the text"

if summarize_button:
    with st.spinner("جاري التلخيص ..."):  # "Summarizing ..."
        result = get_results(doc, model, num_beams, length_penalty, number_of_sentence)
    if len(result) > 0:
        st.write(result)
    else:
        st.write("")
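In summarize.py (below), get_results is wrapped in functools.lru_cache, which caches finished summaries but still rebuilds the Hugging Face pipeline on every cache miss, so switching models in the sidebar reloads checkpoints each time. A minimal sketch of caching the loaded pipelines themselves, using a hypothetical load_summarizer helper that is not part of this commit:

from functools import lru_cache
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

@lru_cache(maxsize=4)
def load_summarizer(model_name: str):
    # Hypothetical helper: load a checkpoint once and keep it in memory;
    # later calls with the same model_name reuse the cached pipeline
    # instead of downloading and reloading it.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("text2text-generation", model=model, tokenizer=tokenizer)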
requirements.txt
ADDED
@@ -0,0 +1,72 @@
altair==4.2.0
arabert==1.0.1
arabic-reshaper==2.1.3
attrs==22.2.0
blinker==1.5
cachetools==5.2.0
certifi==2022.12.7
charset-normalizer==2.1.1
click==8.1.3
codetiming==1.3.0
commonmark==0.9.1
decorator==5.1.1
emoji==1.4.2
entrypoints==0.4
farasapy==0.0.14
filelock==3.8.2
future==0.18.2
gitdb==4.0.10
GitPython==3.1.29
huggingface-hub==0.11.1
idna==3.4
importlib-metadata==5.2.0
Jinja2==3.1.2
joblib==1.2.0
jsonschema==4.17.3
MarkupSafe==2.1.1
nltk==3.8
numpy==1.24.0
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
packaging==22.0
pandas==1.5.2
Pillow==9.3.0
preprocess==2.0.0
protobuf==3.20.2
PyArabic==0.6.15
pyarrow==10.0.1
pydeck==0.8.0
Pygments==2.13.0
Pympler==1.0.1
pyrsistent==0.19.2
python-bidi==0.4.2
python-dateutil==2.8.2
pytz==2022.7
pytz-deprecation-shim==0.1.0.post0
PyYAML==6.0
regex==2022.10.31
requests==2.28.1
rich==12.6.0
semver==2.13.0
sentencepiece==0.1.97
six==1.16.0
smmap==5.0.0
streamlit==1.16.0
streamlit-lottie==0.0.3
streamlit-option-menu==0.3.2
tokenizers==0.13.2
toml==0.10.2
toolz==0.12.0
torch==1.13.1
tornado==6.2
tqdm==4.64.1
transformers==4.25.1
typing_extensions==4.4.0
tzdata==2022.7
tzlocal==4.2
urllib3==1.26.13
validators==0.20.0
watchdog==2.2.0
zipp==3.11.0
style.css
ADDED
@@ -0,0 +1,35 @@
.stMarkdown h1, .main .element-container.css-o7ulmj.e1tzin5v3 {
    text-align: right;
}
.stMarkdown div.css-nlntq9.e16nr0p33 {
    font-weight: bold;
}
textarea {
    direction: rtl;
    height: 140px;
}
.stTextArea .css-qrbaxs {
    float: right;
    font-size: 23px;
}
h1 {
    font-family: 'Scheherazade', serif;
}
.main div.css-nlntq9.e16nr0p33 > p {
    direction: rtl;
}
.main .stMarkdown div.css-nlntq9 p {
    font-size: 22px;
}
.main .stMarkdown div.css-nlntq9 {
    direction: rtl;
}
.main p, .main div, .main input, .main label {
    text-align: right;
    direction: rtl;
}
.main button {
    font-size: 22px;
}
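Note: selectors such as .css-o7ulmj, .css-nlntq9, and .css-qrbaxs target class names that Streamlit generates automatically, so this stylesheet is effectively tied to the streamlit==1.16.0 pin in requirements.txt; after a Streamlit upgrade the hashes may change and these rules may stop matching.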
summarize.py
ADDED
@@ -0,0 +1,171 @@
import heapq
import logging
import os
from functools import lru_cache
from string import punctuation

import nltk
from codetiming import Timer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

punctuation = punctuation + '\n'
logger = logging.getLogger(__name__)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The sentence tokenizer and the Arabic/English stopword lists are NLTK
# data packages that must be present before the first request.
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

# Hugging Face checkpoints behind each abstractive option in the sidebar.
# All five share the same text2text-generation pipeline and decoding
# settings; they differ only in the checkpoint that gets loaded.
ABSTRACTIVE_MODELS = {
    'arabartsummarization': "abdalrahmanshahrour/arabartsummarization",
    'AraBART': "abdalrahmanshahrour/AraBART-summ",
    'auto-arabic-summarization': "abdalrahmanshahrour/auto-arabic-summarization",
    'BERT2BERT': "malmarjeh/bert2bert",
    'xlmroberta2xlmroberta': "ahmeddbahaa/xlmroberta2xlmroberta-finetune-summarization-ar",
}


@lru_cache(maxsize=200)
def get_results(text, model_selected, num_beams, length_penalty, number_of_sentence):
    logger.info("\n=================================================================")
    logger.info(f"Text: {text}")
    logger.info(f"model_selected: {model_selected}")
    logger.info(f"length_penalty: {length_penalty}")
    logger.info(f"input length: {len(text.split())}")
    reader_time = Timer("summarize", text="Time: {:.2f}", logger=logging.info)
    reader_time.start()

    if model_selected in ABSTRACTIVE_MODELS:
        model_name = ABSTRACTIVE_MODELS[model_selected]
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        summarizer = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
        result = summarizer(text,
                            pad_token_id=tokenizer.eos_token_id,
                            num_beams=num_beams,
                            repetition_penalty=3.0,
                            max_length=200,
                            length_penalty=length_penalty,
                            no_repeat_ngram_size=3)[0]['generated_text']
        logger.info(model_selected)

    elif model_selected == "nltk_summarizer":
        # Extractive fallback: score each sentence by the normalized
        # frequencies of its non-stopword tokens, then keep the top
        # `number_of_sentence` sentences.
        stop_words = set(nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english"))
        word_frequencies = {}
        for word in nltk.word_tokenize(text):
            if word not in stop_words and word not in punctuation:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1

        maximum_frequency = max(word_frequencies.values(), default=3)
        for word in word_frequencies:
            word_frequencies[word] = word_frequencies[word] / maximum_frequency

        # Only sentences shorter than 30 words are eligible for the summary.
        sentence_scores = {}
        for sent in nltk.sent_tokenize(text):
            for word in nltk.word_tokenize(sent.lower()):
                if word in word_frequencies and len(sent.split(' ')) < 30:
                    sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

        summary_sentences = heapq.nlargest(number_of_sentence, sentence_scores, key=sentence_scores.get)
        result = ' '.join(summary_sentences)

    else:
        result = "الرجاء اختيار نموذج"  # "Please choose a model"

    reader_time.stop()
    logger.info(f"Time spent summarizing: {reader_time.last}")

    return result


if __name__ == "__main__":
    results_dict = ""
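A quick way to exercise get_results without launching Streamlit is the extractive path, since 'nltk_summarizer' needs no checkpoint download. A minimal sketch, assuming the repository files are in the working directory:

from summarize import get_results

# text1.txt is one of the sample documents added in this commit.
with open("text1.txt", encoding="utf-8") as f:
    doc = f.read()

# Beam-search parameters are ignored on the extractive path but are part
# of the signature, so plausible values are passed anyway.
print(get_results(doc, "nltk_summarizer", 3, 1.0, 2))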
text1.txt
ADDED
@@ -0,0 +1,2 @@
يجري علماء في بريطانيا تجربة لاختبار فعالية عقار إيبوبروفين لمساعدة المصابين بفيروس كورونا. وذكرت هيئة الإذاعة البريطانية "بي بي سي" أن فريق مشترك من أطباء مستشفيات "جاي" و"سانت توماس" و"كينغز كوليدج" في لندن يعتقد أن إيبوبروفين، وهو مضاد للالتهابات ومسكن للألم، يمكن أن يعالج صعوبات التنفس.
ويأمل العلماء أن يساعد هذا العلاج المنخفض التكلفة المرضى في الاستغناء عن أجهزة التنفس الصناعي. وذكرت أنه خلال فترة الاختبار، سيحصل نصف المرضى على إيبوبروفين بالإضافة إلى الرعاية المعتادة، حيث سيتم استخدام تركيبة خاصة من إيبوبروفين بدلا من الأقراص العادية التي قد يشتريها الناس عادة.
text2.txt
ADDED
@@ -0,0 +1 @@
شهدت مدينة طرابلس، مساء أمس الأربعاء، احتجاجات شعبية وأعمال شغب لليوم الثالث على التوالي، وذلك بسبب تردي الوضع المعيشي والاقتصادي. واندلعت مواجهات عنيفة وعمليات كر وفر ما بين الجيش اللبناني والمحتجين استمرت لساعات، إثر محاولة فتح الطرقات المقطوعة، ما أدى إلى إصابة العشرات من الطرفين.