# gpt-wiki-az / collect_data.py
import wikipediaapi
import json
from tqdm import tqdm
import time


def get_wiki_pages(categories=["Azərbaycan tarixi", "Azərbaycan mədəniyyəti",
                               "Azərbaycan ədəbiyyatı", "Azərbaycan coğrafiyası"],
                   min_length=500, max_pages=1000):
    """
    Recursively collect substantial Azerbaijani Wikipedia pages from multiple categories.
    """
    # Recent versions of the wikipedia-api package expect a descriptive user agent.
    wiki = wikipediaapi.Wikipedia(
        language='az',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
        user_agent='AzGPTDataCollector/1.0'
    )

    collected_pages = {}
    visited_pages = set()       # article titles already processed
    visited_categories = set()  # category titles already expanded (guards against cycles)
    def collect_pages(category_title):
        if len(collected_pages) >= max_pages:
            return

        # Categories can reference each other; skip any category we have already
        # expanded so the recursion cannot loop forever.
        if category_title in visited_categories:
            return
        visited_categories.add(category_title)

        category = wiki.page(f"Kateqoriya:{category_title}")
        if not category.exists():
            print(f"Category not found: {category_title}")
            return

        # First, process all articles in this category
        for member in category.categorymembers.values():
            if len(collected_pages) >= max_pages:
                return
            if member.title in visited_pages:
                continue
            visited_pages.add(member.title)

            # Skip category and template pages
            if member.title.startswith('Kateqoriya:') or member.title.startswith('Şablon:'):
                continue

            # Skip pages whose content is too short
            if len(member.text) < min_length:
                continue

            collected_pages[member.title] = {
                'title': member.title,
                'text': member.text,
                'url': member.fullurl,
                'length': len(member.text)
            }
            print(f"Collected: {member.title} ({len(member.text)} chars)")

            # Delay to avoid hitting API rate limits
            time.sleep(0.1)

        # Then recurse into subcategories
        for subcategory in category.categorymembers.values():
            if subcategory.title.startswith('Kateqoriya:'):
                collect_pages(subcategory.title.replace('Kateqoriya:', ''))
    # Start collection from each seed category
    for category in categories:
        print(f"\nStarting collection from category: {category}")
        collect_pages(category)

    return collected_pages


def preprocess_text(text):
    """
    Light text preprocessing for Azerbaijani Wikipedia text.
    """
    # Normalise a few characters that commonly appear mis-encoded. A blanket
    # 'i' -> 'ı' replacement would corrupt valid Azerbaijani text (both letters
    # exist in the alphabet), so only unambiguous substitutions are applied.
    replacements = {
        'І': 'I',    # Cyrillic І to Latin I
        '...': '…',  # collapse three dots into an ellipsis before spacing punctuation
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Collapse runs of whitespace
    text = ' '.join(text.split())

    # Ensure a space follows closing punctuation
    for punct in '.!?،؛:)]}»':
        text = text.replace(punct, punct + ' ')

    # Re-collapse any double spaces introduced by the punctuation pass
    return ' '.join(text.split())
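
# Illustrative example of the cleaning above on a made-up sentence (not taken from
# the collected dataset):
#   preprocess_text("Bakı  Azərbaycanın paytaxtıdır.Şəhər Xəzər dənizinin sahilində yerləşir...")
#   -> "Bakı Azərbaycanın paytaxtıdır. Şəhər Xəzər dənizinin sahilində yerləşir…"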


def save_dataset(pages, output_file='az_wiki_data.json'):
    """
    Save collected pages to a JSON file.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(pages, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(pages)} pages to {output_file}")


def main():
    # Collect pages with a minimum length requirement
    print("Starting data collection...")
    pages = get_wiki_pages(min_length=500, max_pages=100)  # at least 500 chars per page

    # Preprocess and save
    print("\nPreprocessing and saving data...")
    for title in pages:
        pages[title]['text'] = preprocess_text(pages[title]['text'])
        # Keep the stored length in sync with the cleaned text
        pages[title]['length'] = len(pages[title]['text'])
    save_dataset(pages)

    # Print statistics
    if pages:
        total_chars = sum(page['length'] for page in pages.values())
        print("\nCollection complete!")
        print(f"Total pages: {len(pages)}")
        print(f"Total characters: {total_chars}")
        print(f"Average page length: {total_chars / len(pages):.2f} characters")

        # Print a few titles as examples
        print("\nSample of collected articles:")
        for title in list(pages.keys())[:5]:
            print(f"- {title} ({pages[title]['length']} chars)")


if __name__ == "__main__":
    main()
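
# Downstream use (sketch, not part of the original script): the saved JSON maps each
# title to {'title', 'text', 'url', 'length'}, so a plain-text training corpus could
# be assembled roughly like this (file names here are assumptions):
#
#     with open('az_wiki_data.json', encoding='utf-8') as f:
#         data = json.load(f)
#     corpus = "\n\n".join(page['text'] for page in data.values())
#     with open('az_wiki_corpus.txt', 'w', encoding='utf-8') as f:
#         f.write(corpus)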