import json
import time

import wikipediaapi
from tqdm import tqdm


def get_wiki_pages(categories=["Azərbaycan tarixi", "Azərbaycan mədəniyyəti",
                               "Azərbaycan ədəbiyyatı", "Azərbaycan coğrafiyası"],
                   min_length=500, max_pages=1000):
    """
    Recursively collect substantial Azerbaijani Wikipedia pages from multiple categories.
    """
    wiki = wikipediaapi.Wikipedia(
        language='az',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        user_agent='AzGPTDataCollector/1.0'
    )

    collected_pages = {}
    visited_pages = set()
    visited_categories = set()  # guards against loops in the category graph

    def collect_pages(category_title):
        if len(collected_pages) >= max_pages or category_title in visited_categories:
            return
        visited_categories.add(category_title)

        category = wiki.page(f"Kateqoriya:{category_title}")
        if not category.exists():
            print(f"Category not found: {category_title}")
            return

        for member in category.categorymembers.values():
            if len(collected_pages) >= max_pages:
                return

            if member.title in visited_pages:
                continue
            visited_pages.add(member.title)

            # Skip category and template pages; subcategories are recursed into below.
            if member.title.startswith('Kateqoriya:') or member.title.startswith('Şablon:'):
                continue

            # Reading member.text fetches the full article, so stubs are only
            # filtered out after that request has been made.
            if len(member.text) < min_length:
                continue

            collected_pages[member.title] = {
                'title': member.title,
                'text': member.text,
                'url': member.fullurl,
                'length': len(member.text)
            }
            print(f"Collected: {member.title} ({len(member.text)} chars)")

            # Small delay to stay polite to the Wikipedia API.
            time.sleep(0.1)

        # After the direct members, recurse into any subcategories.
        for subcategory in category.categorymembers.values():
            if subcategory.title.startswith('Kateqoriya:'):
                collect_pages(subcategory.title.replace('Kateqoriya:', '', 1))

    for category in categories:
        print(f"\nStarting collection from category: {category}")
        collect_pages(category)

    return collected_pages


def preprocess_text(text):
    """
    Enhanced text preprocessing for Azerbaijani text.
    """
    # Normalize confusable characters and ellipses first, while multi-character
    # patterns such as '...' are still intact. Note that 'i' and 'ı' are distinct
    # letters in Azerbaijani, so they are deliberately left untouched.
    replacements = {
        'І': 'I',   # normalize look-alike capital I to the Latin letter
        '...': '…',
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Append a space after punctuation marks so tokens do not run together.
    for punct in '.!?،؛:()[]{}«»':
        text = text.replace(punct, punct + ' ')

    # Collapse all whitespace runs (including the doubled spaces introduced above)
    # into single spaces.
    return ' '.join(text.split())
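
# Illustrative behaviour of preprocess_text on made-up sample text (not output
# from a real run):
#   preprocess_text("Sumqayıt   şəhəri...Bakıdan 30 km aralıdadır.")
#   -> "Sumqayıt şəhəri…Bakıdan 30 km aralıdadır."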


def save_dataset(pages, output_file='az_wiki_data.json'):
    """
    Save collected pages to a JSON file.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(pages, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(pages)} pages to {output_file}")


def main():
    print("Starting data collection...")
    pages = get_wiki_pages(min_length=500, max_pages=100)

    print("\nPreprocessing and saving data...")
    for title in tqdm(pages):
        pages[title]['text'] = preprocess_text(pages[title]['text'])
        # Keep the stored length in sync with the preprocessed text.
        pages[title]['length'] = len(pages[title]['text'])

    save_dataset(pages)

    if pages:
        total_chars = sum(page['length'] for page in pages.values())
        print("\nCollection complete!")
        print(f"Total pages: {len(pages)}")
        print(f"Total characters: {total_chars}")
        print(f"Average page length: {total_chars / len(pages):.2f} characters")

        print("\nSample of collected articles:")
        for title in list(pages.keys())[:5]:
            print(f"- {title} ({pages[title]['length']} chars)")


if __name__ == "__main__":
    main()
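

# A minimal downstream-usage sketch (assumes the default output file written by
# save_dataset above; not part of the collection pipeline itself):
#
#   import json
#   with open('az_wiki_data.json', encoding='utf-8') as f:
#       dataset = json.load(f)
#   corpus = "\n\n".join(page['text'] for page in dataset.values())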