import wikipediaapi |
import json |
from tqdm import tqdm |
import time |
def get_wiki_pages(categories=("Azərbaycan tarixi", "Azərbaycan mədəniyyəti",
                               "Azərbaycan ədəbiyyatı", "Azərbaycan coğrafiyası"),
                   min_length=500, max_pages=1000):
    """
    Collect substantial Azerbaijani Wikipedia pages from several category trees.

    Each starting category is walked depth-first: the pages that are direct
    members of a category are examined first, then its subcategories in the
    order the API listed them.

    Args:
        categories: Iterable of category titles, without the ``Kateqoriya:``
            prefix.
        min_length: Pages whose text is shorter than this many characters are
            skipped.
        max_pages: Collection stops once this many pages have been gathered.

    Returns:
        A dict mapping page title to a record with the keys ``title``,
        ``text``, ``url`` and ``length`` (character count of ``text``).
    """
    category_prefix = 'Kateqoriya:'
    template_prefix = 'Şablon:'

    wiki = wikipediaapi.Wikipedia(
        language='az',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        user_agent='AzGPTDataCollector/1.0'
    )
    collected_pages = {}
    visited_pages = set()
    # Categories can contain each other (cycles) or be reachable through
    # several parents; remembering the ones already walked guarantees that the
    # traversal terminates and never re-crawls a subtree.
    visited_categories = set()

    def collect_pages(root_title):
        # An explicit stack replaces recursion so that a deep category tree
        # cannot exhaust the interpreter's recursion limit.
        pending = [root_title]
        while pending and len(collected_pages) < max_pages:
            category_title = pending.pop()
            if category_title in visited_categories:
                continue
            visited_categories.add(category_title)

            category = wiki.page(f"{category_prefix}{category_title}")
            if not category.exists():
                print(f"Category not found: {category_title}")
                continue

            subcategories = []
            for member in category.categorymembers.values():
                if len(collected_pages) >= max_pages:
                    return
                title = member.title
                if title.startswith(category_prefix):
                    # Strip only the leading prefix; walked after the pages.
                    subcategories.append(title[len(category_prefix):])
                    continue
                if title.startswith(template_prefix) or title in visited_pages:
                    continue
                # Mark before the length check so short pages are not
                # fetched again when they appear in another category.
                visited_pages.add(title)

                text = member.text
                if len(text) < min_length:
                    continue
                collected_pages[title] = {
                    'title': title,
                    'text': text,
                    'url': member.fullurl,
                    'length': len(text)
                }
                print(f"Collected: {title} ({len(text)} chars)")
                # Short pause between collected pages to go easy on the API.
                time.sleep(0.1)

            # Reversed so that pop() yields subcategories in listing order.
            pending.extend(reversed(subcategories))

    for category in categories:
        print(f"\nStarting collection from category: {category}")
        collect_pages(category)
    return collected_pages
def preprocess_text(text):
    """
    Normalize Azerbaijani text for the dataset.

    Steps, in order:
      1. Replace Cyrillic look-alike letters with their Latin counterparts and
         turn a run of three dots into a single ellipsis character.
      2. Insert a space after each punctuation mark / bracket in the set below.
      3. Collapse every run of whitespace into one space and strip the ends.

    Args:
        text: The raw page text.

    Returns:
        The normalized text.
    """
    # Runs before the punctuation pass: once a space has been inserted after
    # every '.', the three-dot sequence can no longer be found.
    replacements = {
        '\u0456': 'i',  # Cyrillic small "і" look-alike -> Latin "i"
        'І': 'I',       # look-alike capital (presumably Cyrillic) -> Latin "I"
        '...': '…',
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    for punct in '.!?،؛:()[]{}«»':
        text = text.replace(punct, punct + ' ')

    # Collapse whitespace last so the spaces inserted above cannot leave
    # doubled or trailing blanks behind.
    return ' '.join(text.split())
def save_dataset(pages, output_file='az_wiki_data.json'):
    """
    Write the collected pages to ``output_file`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped, and the number
    of saved pages is printed once the file has been written.
    """
    with open(output_file, mode='w', encoding='utf-8') as handle:
        json.dump(pages, handle, indent=2, ensure_ascii=False)

    page_count = len(pages)
    print(f"Saved {page_count} pages to {output_file}")
def main():
    """
    Run the pipeline: collect pages, preprocess their text, save the dataset
    and print summary statistics.
    """
    print("Starting data collection...")
    pages = get_wiki_pages(min_length=500, max_pages=100)

    print("\nPreprocessing and saving data...")
    for page in pages.values():
        page['text'] = preprocess_text(page['text'])
        # Preprocessing changes the text, so refresh the stored length to keep
        # the saved record and the statistics below consistent with it.
        page['length'] = len(page['text'])
    save_dataset(pages)

    if not pages:
        # Nothing to summarize; this also avoids dividing by zero below.
        print("\nNo pages were collected.")
        return

    total_chars = sum(page['length'] for page in pages.values())
    print("\nCollection complete!")
    print(f"Total pages: {len(pages)}")
    print(f"Total characters: {total_chars}")
    print(f"Average page length: {total_chars / len(pages):.2f} characters")
    print("\nSample of collected articles:")
    for title in list(pages)[:5]:
        print(f"- {title} ({pages[title]['length']} chars)")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()