|
import re

import pandas as pd
import pysbd
import requests
from bs4 import BeautifulSoup

|
def extract_result(sentence):
    """Map a closing sentence from a deletion discussion to a normalized outcome."""
    match = re.search(
        r"(Διαγραφή|Παραμονή|Άλλο αποτέλεσμα|Συγχώνευση|Διατήρηση)",
        sentence,
        flags=re.IGNORECASE,
    )
    # Closing sentences that imply deletion without using one of the keywords above.
    delete_cases = [
        'Μη εγκυκλοπαιδικό',
        'Πράγματι δεν φαίνεται πως το λήμμα είναι εγκυκλοπαιδικό',
        'Δεν διαπιστώθηκε εγκυκλοπαιδικότητα',
        'Μη εγκυκλοπαιδικό λήμμα',
        'Το λήμμα κρίθηκε ότι είναι καταλληλότερο για κάποιο άλλο αδελφό εγχείρημα, παρά για την Βικιπαίδεια + ατεκμηρίωτο.',
        'Δεν υπάρχουν επαρκείς αναφορές για την βιογραφούμενη',
    ]
    if match:
        outcome = match.group(1).strip()
    elif sentence in delete_cases:
        outcome = 'Διαγραφή'
    else:
        outcome = 'Δεν υπάρχει συναίνεση'
    return normalize_outcome(outcome)
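# Illustrative behavior (hypothetical inputs, not taken from real pages):
#   extract_result('Το αποτέλεσμα ήταν: Διαγραφή')  -> 'Διαγραφή'
#   extract_result('καμία σαφής έκβαση')            -> 'Δεν υπάρχει συναίνεση'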
|
|
|
def normalize_outcome(o):
    """Collapse a free-text outcome into one of four canonical labels."""
    lowered = o.lower()
    if 'διαγρ' in lowered:
        return 'Διαγραφή'
    elif 'διατήρη' in lowered or 'παραμονή' in lowered:
        return 'Διατήρηση'
    elif 'συγχών' in lowered:
        return 'Συγχώνευση'  # capitalized for consistency with the other labels
    else:
        return 'Δεν υπάρχει συναίνεση'
|
|
|
def _extract_discussions(url, table_class=None):
    """Shared scraper for a monthly deletion-log page.

    If table_class is given, only tables with that CSS class are parsed
    (the older, non-collapsible layout); otherwise every table is tried.
    """
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])

    soup = BeautifulSoup(response.content, 'html.parser')
    discussion_sections = soup.find_all('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
    titles = []
    for section in discussion_sections:
        h2_tag = section.find('h2')
        if not h2_tag:
            continue
        title_link = h2_tag.find('a')
        title = title_link.text.strip() if title_link else h2_tag.get_text(strip=True)
        titles.append(title)

    discussion_tables = soup.find_all('table', class_=table_class) if table_class else soup.find_all('table')
    if not discussion_tables:
        return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])

    data = []
    for idx, table in enumerate(discussion_tables):
        try:
            # First row: closing statement; the outcome sentence is italicized in the <th>.
            decision_row = table.find('tr')
            decision_cell = decision_row.find('th') if decision_row else None
            if decision_cell:
                result_match = re.search(
                    r"Η συζήτηση τελείωσε, το αποτέλεσμα ήταν: <i>(.*?)</i>", str(decision_cell), re.DOTALL
                )
                result_sentence = result_match.group(1).strip() if result_match else "No result found"
            else:
                result_sentence = "No result found"

            # Second row: discussion body. Drop the boilerplate header that
            # ends with the "μητρώο" (log) link block.
            discussion_row = decision_row.find_next_sibling('tr') if decision_row else None
            discussion_cell = discussion_row.find('td', class_='plainlinks') if discussion_row else None
            discussion_content = discussion_cell.get_text(separator="\n") if discussion_cell else "No discussion content found"
            discussion_content = discussion_content.split('\nμητρώο\n)\n\n\n\n\n')[-1].replace('\n', '')

            title = titles[idx] if idx < len(titles) else f"Discussion {idx + 1}"
            data.append({
                "title": title,
                "discussion": discussion_content,
                "result_sentence": result_sentence,
                "result": extract_result(result_sentence),
                "text_url": url,
            })
        except Exception:
            # Skip tables that do not follow the expected discussion layout.
            continue

    return pd.DataFrame(data, columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])


def extract_discussions_from_page_collapsible(url):
    """Scrape discussions from a page that uses the collapsible table layout."""
    return _extract_discussions(url)


def extract_discussions_from_page_non_collapsible(url):
    """Scrape discussions from a page that uses the older 'pagediscussion' table layout."""
    return _extract_discussions(url, table_class='pagediscussion')


def html_to_plaintext(html_content):
    """Convert discussion HTML to plain text, keeping paragraph and list breaks."""
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    text = soup.get_text(separator=' ', strip=True)
    text = '\n'.join(line.strip() for line in text.splitlines() if line.strip())
    return text
|
|
|
def split_text_into_sentences(text):
    """Segment Greek text into sentences with pysbd, then rejoin them with single spaces."""
    seg = pysbd.Segmenter(language="el", clean=False)
    sentences = seg.segment(text)
    return ' '.join(sentences)
|
|
|
def clean_discussion_text(text):
    """Final cleanup hook; currently just trims surrounding whitespace."""
    return text.strip()
|
|
|
def extract_outcome_from_text(text):
    """Find the first known outcome keyword in free text and normalize it."""
    # Matching is case-insensitive (both sides are lowercased below).
    outcomes = ['Διαγραφή', 'Παραμονή', 'Συγχώνευση', 'Διατήρηση', 'Άλλο αποτέλεσμα']
    lowered = text.lower()
    found_outcome = None
    for outcome in outcomes:
        if outcome.lower() in lowered:
            found_outcome = outcome
            break
    if not found_outcome:
        found_outcome = 'Δεν υπάρχει συναίνεση'
    return normalize_outcome(found_outcome)
|
|
|
def extract_discussion_section(soup, title):
    """Collect the HTML between a discussion's <h2> heading and the next one.

    Returns (discussion_html, label, cleaned_discussion); empty strings when
    the section cannot be located.
    """
    t = title.replace(' ', '_')
    h2_tag = soup.find('h2', id=t)
    if not h2_tag:
        return '', '', ''

    heading_div = h2_tag.find_parent('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
    if not heading_div:
        return '', '', ''

    next_heading_div = heading_div.find_next('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')

    # Walk the siblings between this heading and the next one, keeping their HTML.
    html_fragments = []
    current = heading_div.next_sibling
    while current and current != next_heading_div:
        if hasattr(current, 'prettify'):
            html_fragments.append(current.prettify())
        else:
            html_fragments.append(str(current))
        current = current.next_sibling

    discussion_html = ''.join(html_fragments).strip()
    if not discussion_html:
        return '', '', ''

    sub_soup = BeautifulSoup(discussion_html, 'html.parser')
    discussion_tags = sub_soup.find_all(['p', 'ul', 'dl'])
    if not discussion_tags:
        return '', '', ''

    # Drop markup that carries no discussion text (signatures, icons, nested tables).
    cleaned_parts = []
    for tag in discussion_tags:
        for unwanted in tag.find_all(['span', 'img', 'a', 'div', 'table'], recursive=True):
            unwanted.decompose()
        text = tag.get_text(separator=' ', strip=True)
        if text:
            cleaned_parts.append(text)

    cleaned_discussion = ' '.join(cleaned_parts)
    label = extract_outcome_from_text(cleaned_discussion)

    return discussion_html, label, cleaned_discussion
|
|
|
def extract_fallback_discussion(url, title):
    """Fallback: scan every table on the page for one whose text mentions the title."""
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return '', None

    soup = BeautifulSoup(response.text, 'html.parser')
    discussion_tables = soup.find_all('table')
    if not discussion_tables:
        return '', None

    for table in discussion_tables:
        table_text = table.get_text(separator='\n', strip=True)
        if title in table_text:
            decision_row = table.find('tr')
            decision_cell = decision_row.find('th') if decision_row else None
            if decision_cell:
                result_match = re.search(
                    r"Η συζήτηση τελείωσε, το αποτέλεσμα ήταν: <i>(.*?)</i>", str(decision_cell), re.DOTALL
                )
                result_sentence = result_match.group(1).strip() if result_match else "No result found"
            else:
                result_sentence = "No result found"

            discussion_row = decision_row.find_next_sibling('tr') if decision_row else None
            discussion_cell = discussion_row.find('td', class_='plainlinks') if discussion_row else None
            discussion_content = discussion_cell.get_text(separator=' ', strip=True) if discussion_cell else ''

            if discussion_content:
                outcome = extract_result(result_sentence)
                return discussion_content, outcome

    return '', None
|
|
|
def extract_div_from_title_with_fallback(title, date=''):
    """Fetch a single discussion by title from the monthly log page for `date`.

    `date` is either 'mm/yyyy' or an already-formatted 'Month_Year' string
    using the Greek genitive month name (e.g. 'Ιανουαρίου_2024').
    """
    if not date:
        raise ValueError("For 'title' mode, 'date' must be provided in the format: mm/yyyy")

    month_map = {
        '01': 'Ιανουαρίου', '02': 'Φεβρουαρίου', '03': 'Μαρτίου', '04': 'Απριλίου',
        '05': 'Μαΐου', '06': 'Ιουνίου', '07': 'Ιουλίου', '08': 'Αυγούστου',
        '09': 'Σεπτεμβρίου', '10': 'Οκτωβρίου', '11': 'Νοεμβρίου', '12': 'Δεκεμβρίου',
    }
    if '_' in date and date.split('_')[0] in month_map.values():
        # Already in 'Month_Year' form (e.g. taken from a page URL).
        date_str = date
    else:
        match = re.match(r'(\d{2})/(\d{4})', date)
        if not match:
            raise ValueError("Date must be in the format mm/yyyy or Month_Year")
        mm, yyyy = match.groups()
        if mm not in month_map:
            raise ValueError(f"Invalid month: {mm}")
        date_str = f"{month_map[mm]}_{yyyy}"

    base_url = 'https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή'
    url = f"{base_url}/{date_str}#{title}"

    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return pd.DataFrame(columns=['title', 'discussion_url', 'discussion', 'outcome'])

    soup = BeautifulSoup(response.content, 'html.parser')
    discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)

    text_url = f"{base_url}/{date_str}"
    discussion_url = text_url + '#' + title

    cleaned_discussion = html_to_plaintext(cleaned_discussion)
    cleaned_discussion = split_text_into_sentences(cleaned_discussion)
    cleaned_discussion = clean_discussion_text(cleaned_discussion)

    # If the section walk produced nothing, fall back to scanning the page's tables.
    if not cleaned_discussion.strip():
        discussion_content, outcome = extract_fallback_discussion(text_url, title)
        cleaned_discussion = html_to_plaintext(discussion_content)
        cleaned_discussion = split_text_into_sentences(cleaned_discussion)
        cleaned_discussion = clean_discussion_text(cleaned_discussion)
        if outcome:
            label = normalize_outcome(outcome)

    return pd.DataFrame(
        [[title, discussion_url, cleaned_discussion, label]],
        columns=['title', 'discussion_url', 'discussion', 'outcome'],
    )
|
|
|
|
def collect_gr(mode='url', title='', url='', years=None):
    """Collect Greek Wikipedia deletion discussions.

    mode='title': one discussion; pass `title` and years=['mm/yyyy'].
    mode='url':   one discussion; pass the full log-page URL with a #title fragment.
    mode='year':  every discussion for one year, or for a [start, end] range.
    """
    years = years or []
    if mode not in ['title', 'year', 'url']:
        raise ValueError("mode must be either 'title' or 'year' or 'url'.")

    if mode == 'title':
        if not title or not years or len(years) != 1:
            raise ValueError("For 'title' mode, 'title' must be provided and 'years' must be a single-element list like ['mm/yyyy'].")
        date = years[0]
        df = extract_div_from_title_with_fallback(title, date=date)
        return df[['title', 'discussion_url', 'discussion', 'outcome']]

    elif mode == 'url':
        if title or years:
            raise ValueError("For 'url' mode, 'title' must be empty and 'years' must be empty.")

        match = re.search(r'Βικιπαίδεια:Σελίδες_για_διαγραφή/([^#]+)#(.+)', url)
        if not match:
            raise ValueError("URL format is incorrect.")
        date_str, title = match.groups()
        df = extract_div_from_title_with_fallback(title, date=date_str)
        return df[['title', 'discussion_url', 'discussion', 'outcome']]

    elif mode == 'year':
        if title or not years:
            raise ValueError("For 'year' mode, 'title' must be empty and 'years' must be provided.")
        if len(years) == 1:
            start_year = end_year = years[0]
        elif len(years) == 2:
            start_year, end_year = min(years), max(years)
        else:
            raise ValueError("Invalid years input. Provide one year or two years for a range.")

        # Genitive month names as they appear in the monthly log page titles.
        months = ['Ιανουαρίου', 'Φεβρουαρίου', 'Μαρτίου', 'Απριλίου', 'Μαΐου', 'Ιουνίου',
                  'Ιουλίου', 'Αυγούστου', 'Σεπτεμβρίου', 'Οκτωβρίου', 'Νοεμβρίου', 'Δεκεμβρίου']

        all_data = []
        for year in range(start_year, end_year + 1):
            for month in months:
                page_url = f"https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή/{month}_{year}"
                df = extract_discussions_from_page_collapsible(page_url)
                if df.empty:
                    df = extract_discussions_from_page_non_collapsible(page_url)

                if not df.empty:
                    df['result'] = df['result'].apply(normalize_outcome)  # idempotent safety pass
                    df['discussion_url'] = df.apply(
                        lambda row: row['text_url'] + '#' + row['title'].replace(' ', '_'), axis=1
                    )
                    df = df.rename(columns={'result': 'outcome'})
                    all_data.append(df[['title', 'discussion_url', 'discussion', 'outcome']])

        if all_data:
            return pd.concat(all_data, ignore_index=True)
        return pd.DataFrame(columns=['title', 'discussion_url', 'discussion', 'outcome'])
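

# Minimal usage sketch, assuming network access to el.wikipedia.org; the year
# below is an arbitrary placeholder, not a curated dataset.
if __name__ == '__main__':
    sample = collect_gr(mode='year', years=[2023])
    print(sample[['title', 'outcome']].head())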
|
|