Upload 3 files
App_Function_Libraries/MediaWiki/Media_Wiki.py
ADDED
@@ -0,0 +1,210 @@
# Media_Wiki.py
# Description: This file contains the functions to import MediaWiki dumps into the media_db and Chroma databases.
#######################################################################################################################
#
# Imports
import json
import logging
import os
import re
from typing import List, Dict, Any, Iterator, Optional
# 3rd-Party Imports
import mwparserfromhell
import mwxml
import yaml
#
# Local Imports
from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords, check_media_exists
from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
#
#######################################################################################################################
#
# Functions:

def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
    """Set up and return a logger with the given name and level."""
    logger = logging.getLogger(name)
    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    return logger

# Usage
logger = setup_logger('mediawiki_import', log_file='mediawiki_import.log')

# Load configuration
def load_mediawiki_import_config():
    with open(os.path.join('Config_Files', 'mediawiki_import_config.yaml'), 'r') as f:
        return yaml.safe_load(f)

try:
    config = load_mediawiki_import_config()
except FileNotFoundError:
    # Keep the module importable (e.g. for tests) when the optional config file is absent.
    logger.warning("mediawiki_import_config.yaml not found; using built-in chunking defaults.")
    config = {'chunking': {'max_size': 1000}}


def parse_mediawiki_dump(file_path: str, namespaces: Optional[List[int]] = None,
                         skip_redirects: bool = False) -> Iterator[Dict[str, Any]]:
    """Stream pages from a MediaWiki XML dump, yielding one dict of plain text per revision."""
    # Keep the file handle open for the lifetime of the generator, then close it.
    with open(file_path, encoding='utf-8') as dump_file:
        dump = mwxml.Dump.from_file(dump_file)
        for page in dump.pages:
            if skip_redirects and page.redirect:
                continue
            if namespaces and page.namespace not in namespaces:
                continue

            for revision in page:
                code = mwparserfromhell.parse(revision.text or "")
                text = code.strip_code(normalize=True, collapse=True, keep_template_params=False)
                yield {
                    "title": page.title,
                    "content": text,
                    "namespace": page.namespace,
                    "page_id": page.id,
                    "revision_id": revision.id,
                    "timestamp": revision.timestamp
                }
            logger.debug(f"Yielded page: {page.title}")


def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split wiki text on level-2 headings and pack the sections into size-bounded chunks."""
    # The YAML config exposes 'default_size', so fall back to it when 'max_size' is absent.
    max_size = chunk_options.get('max_size', chunk_options.get('default_size', 1000))
    sections = re.split(r'\n==\s*(.*?)\s*==\n', text)
    chunks = []
    current_chunk = ""
    current_size = 0

    # re.split keeps the captured headings at odd indices; sections[0] is the lead text.
    section_pairs = [("Introduction", sections[0])] + list(zip(sections[1::2], sections[2::2]))
    for section_title, section_content in section_pairs:
        if current_size + len(section_content) > max_size:
            if current_chunk:
                chunks.append({"text": current_chunk, "metadata": {"section": section_title}})
            current_chunk = section_content
            current_size = len(section_content)
        else:
            current_chunk += f"\n== {section_title} ==\n" + section_content
            current_size += len(section_content)

    if current_chunk:
        chunks.append({"text": current_chunk, "metadata": {"section": "End"}})

    return chunks


def process_single_item(content: str, title: str, wiki_name: str, chunk_options: Dict[str, Any],
                        is_combined: bool = False, item: Dict[str, Any] = None):
    """Store one article (or the combined dump) in the media DB and index its chunks in Chroma."""
    try:
        url = f"mediawiki:{wiki_name}" if is_combined else f"mediawiki:{wiki_name}:{title}"

        if not check_media_exists(title, url):
            media_id = add_media_with_keywords(
                url=url,
                title=title,
                media_type="mediawiki_dump" if is_combined else "mediawiki_article",
                content=content,
                keywords=f"mediawiki,{wiki_name}" + (",full_dump" if is_combined else ",article"),
                prompt="",
                summary="",
                transcription_model="",
                author="MediaWiki",
                ingestion_date=item['timestamp'].strftime('%Y-%m-%d') if item else None
            )

            chunks = optimized_chunking(content, chunk_options)
            for chunk in chunks:
                process_and_store_content(chunk['text'], f"mediawiki_{wiki_name}", media_id, title)
            logger.info(f"Successfully processed item: {title}")
        else:
            logger.info(f"Skipping existing article: {title}")
    except Exception as e:
        logger.error(f"Error processing item {title}: {str(e)}")


def load_checkpoint(file_path: str) -> int:
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            return json.load(f)['last_processed_id']
    return 0


def save_checkpoint(file_path: str, last_processed_id: int):
    with open(file_path, 'w') as f:
        json.dump({'last_processed_id': last_processed_id}, f)


def import_mediawiki_dump(
        file_path: str,
        wiki_name: str,
        namespaces: Optional[List[int]] = None,
        skip_redirects: bool = False,
        chunk_options: Dict[str, Any] = None,
        single_item: bool = False,
        progress_callback: Any = None
) -> Iterator[str]:
    """Import a MediaWiki dump, yielding progress messages as pages are processed."""
    try:
        if chunk_options is None:
            chunk_options = config['chunking']

        checkpoint_file = f"{wiki_name}_import_checkpoint.json"
        last_processed_id = load_checkpoint(checkpoint_file)

        total_pages = count_pages(file_path, namespaces, skip_redirects)
        processed_pages = 0

        yield f"Found {total_pages} pages to process."

        for item in parse_mediawiki_dump(file_path, namespaces, skip_redirects):
            if item['page_id'] <= last_processed_id:
                continue
            process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item)
            save_checkpoint(checkpoint_file, item['page_id'])
            processed_pages += 1
            if progress_callback is not None:
                progress_callback(processed_pages / total_pages, f"Processed page: {item['title']}")
            yield f"Processed page {processed_pages}/{total_pages}: {item['title']}"

        # Remove the checkpoint file after a successful import (it may not exist if nothing was processed).
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
        yield f"Successfully imported and indexed MediaWiki dump: {wiki_name}"
    except FileNotFoundError:
        logger.error(f"MediaWiki dump file not found: {file_path}")
        yield f"Error: File not found - {file_path}"
    except PermissionError:
        logger.error(f"Permission denied when trying to read: {file_path}")
        yield f"Error: Permission denied - {file_path}"
    except Exception as e:
        logger.exception(f"Error during MediaWiki import: {str(e)}")
        yield f"Error during import: {str(e)}"


def count_pages(file_path: str, namespaces: Optional[List[int]] = None, skip_redirects: bool = False) -> int:
    """
    Count the number of pages in a MediaWiki XML dump file.

    Args:
        file_path (str): Path to the MediaWiki XML dump file.
        namespaces (List[int], optional): List of namespace IDs to include. If None, include all namespaces.
        skip_redirects (bool, optional): Whether to skip redirect pages.

    Returns:
        int: The number of pages in the dump file.
    """
    try:
        with open(file_path, encoding='utf-8') as dump_file:
            dump = mwxml.Dump.from_file(dump_file)
            count = 0
            for page in dump.pages:
                if skip_redirects and page.redirect:
                    continue
                if namespaces and page.namespace not in namespaces:
                    continue
                count += 1
            return count
    except Exception as e:
        logger.error(f"Error counting pages in MediaWiki dump: {str(e)}")
        return 0

#
# End of Media_Wiki.py
#######################################################################################################################
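Note on usage: import_mediawiki_dump is a generator that yields progress strings, so a caller has to drain it. The driver below is a minimal sketch and is not part of this commit; the dump path, wiki name, and the package-style import path are assumptions based on the repository layout.

# Hypothetical driver script (not part of this commit); the file name and wiki name are placeholders.
from App_Function_Libraries.MediaWiki.Media_Wiki import import_mediawiki_dump

def run_import(dump_path: str, wiki_name: str) -> None:
    # Mirror the config defaults: main namespace only, skip redirects.
    for message in import_mediawiki_dump(
            dump_path,
            wiki_name,
            namespaces=[0],
            skip_redirects=True,
            progress_callback=lambda fraction, status: print(f"{fraction:.0%} {status}"),
    ):
        print(message)

if __name__ == "__main__":
    run_import("mywiki-latest-pages-articles.xml", "mywiki")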
App_Function_Libraries/MediaWiki/Media_Wiki_Tests.py
ADDED
@@ -0,0 +1,94 @@
# Media_Wiki_Tests.py
# Description: Unit tests for the Media_Wiki module.
#
# Usage:
# pip install pytest
# pytest Media_Wiki_Tests.py
#
# Imports
import pytest
from unittest.mock import patch, MagicMock
# Local Imports
from Media_Wiki import parse_mediawiki_dump, optimized_chunking, process_single_item, import_mediawiki_dump, load_mediawiki_import_config
#
#######################################################################################################################
#
# Functions:


@pytest.fixture
def mock_mwxml_dump():
    mock_dump = MagicMock()
    mock_page = MagicMock()
    mock_page.title = "Test Page"
    mock_page.namespace = 0
    mock_page.id = 1
    mock_page.redirect = False
    mock_revision = MagicMock()
    mock_revision.id = 1
    mock_revision.timestamp = "2021-01-01T00:00:00Z"
    mock_revision.text = "Test content"
    # parse_mediawiki_dump iterates the page object directly to get its revisions.
    mock_page.__iter__.return_value = iter([mock_revision])
    mock_dump.pages = [mock_page]
    return mock_dump


def test_parse_mediawiki_dump(mock_mwxml_dump):
    # builtins.open is patched because the dummy dump path does not exist on disk.
    with patch('builtins.open', MagicMock()), \
         patch('mwxml.Dump.from_file', return_value=mock_mwxml_dump), \
         patch('mwparserfromhell.parse') as mock_parse:
        mock_parse.return_value.strip_code.return_value = "Stripped content"
        result = list(parse_mediawiki_dump("dummy_path"))
        assert len(result) == 1
        assert result[0]['title'] == "Test Page"
        assert result[0]['content'] == "Stripped content"
        assert result[0]['namespace'] == 0
        assert result[0]['page_id'] == 1
        assert result[0]['revision_id'] == 1


def test_optimized_chunking():
    test_text = "Intro text\n== Section 1 ==\nContent 1\n== Section 2 ==\nContent 2"
    # A small max_size forces each section into its own chunk.
    chunk_options = {'max_size': 15}
    result = optimized_chunking(test_text, chunk_options)
    assert len(result) == 3
    assert "Intro text" in result[0]['text']
    assert result[1]['text'] == "Content 1"
    assert result[2]['text'] == "Content 2"
    assert 'metadata' in result[0] and 'section' in result[0]['metadata']


def test_process_single_item():
    with patch('Media_Wiki.check_media_exists', return_value=False), \
         patch('Media_Wiki.add_media_with_keywords', return_value=1), \
         patch('Media_Wiki.process_and_store_content') as mock_process_store:
        process_single_item("Test content", "Test Title", "TestWiki", {'max_size': 100})
        mock_process_store.assert_called()
        # Add more detailed assertions here


def test_import_mediawiki_dump():
    # count_pages is patched so the test never touches the filesystem.
    with patch('Media_Wiki.parse_mediawiki_dump') as mock_parse, \
         patch('Media_Wiki.process_single_item') as mock_process, \
         patch('Media_Wiki.load_checkpoint', return_value=0), \
         patch('Media_Wiki.save_checkpoint'), \
         patch('Media_Wiki.count_pages', return_value=1), \
         patch('os.remove'):
        mock_parse.return_value = [{'page_id': 1, 'title': 'Test', 'content': 'Content'}]
        # import_mediawiki_dump is a generator of progress messages, so drain it into a list.
        messages = list(import_mediawiki_dump("dummy_path", "TestWiki"))
        assert any("Successfully imported" in message for message in messages)
        mock_process.assert_called_once()


def test_import_mediawiki_dump_file_not_found():
    with patch('Media_Wiki.parse_mediawiki_dump', side_effect=FileNotFoundError):
        messages = list(import_mediawiki_dump("non_existent_path", "TestWiki"))
        assert any("Error: File not found" in message for message in messages)


def test_load_mediawiki_import_config():
    with patch('builtins.open', MagicMock()):
        with patch('yaml.safe_load', return_value={'test_key': 'test_value'}):
            config = load_mediawiki_import_config()
            assert 'test_key' in config
            assert config['test_key'] == 'test_value'
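The tests import the module as a top-level name (from Media_Wiki import ...), so they assume the App_Function_Libraries/MediaWiki directory is on sys.path. A minimal, hypothetical conftest.py placed next to the tests (not part of this commit) would satisfy that when running pytest from the repository root:

# conftest.py (hypothetical helper, not part of this commit)
# Put this directory on sys.path so `from Media_Wiki import ...` resolves.
import os
import sys

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))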
App_Function_Libraries/MediaWiki/mediawiki_import_config.yaml
ADDED
@@ -0,0 +1,63 @@
# MediaWiki Import Configuration

# Database settings
database:
  sqlite_path: './Databases/media_summary.db'
  chroma_db_path: 'chroma_db'

# Chunking options
chunking:
  default_method: 'sentences'
  default_size: 1000
  default_overlap: 100
  adaptive: true
  language: 'en'
  methods:
    - 'sentences'
    - 'words'
    - 'paragraphs'
    - 'tokens'

# Import settings
import:
  batch_size: 1000  # Number of pages to process in a single batch
  default_skip_redirects: true
  default_namespaces: [0]  # Main namespace by default
  single_item_default: false

# Processing options
processing:
  max_workers: 4  # Number of worker threads for async processing

# Embedding settings
embeddings:
  provider: 'openai'  # or 'local' or 'huggingface'
  model: 'text-embedding-ada-002'
  api_key: 'your_openai_api_key_here'  # Remove if using local embeddings
  local_url: 'http://localhost:8080/embeddings'  # Only for local embeddings

# ChromaDB settings
chromadb:
  collection_prefix: 'mediawiki_'

# Logging settings
logging:
  level: 'INFO'
  file: 'mediawiki_import.log'

# Checkpoint settings
checkpoints:
  enabled: true
  directory: 'import_checkpoints'

# Error handling
error_handling:
  max_retries: 3
  retry_delay: 5  # seconds

# User interface settings
ui:
  default_chunk_size: 1000
  min_chunk_size: 100
  max_chunk_size: 2000
  default_chunk_overlap: 100
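Two details about this config are worth noting: Media_Wiki.py loads it from Config_Files/mediawiki_import_config.yaml rather than from this directory, so the file presumably needs to be copied or referenced there, and the chunking section exposes default_size while optimized_chunking looks for max_size first. The snippet below is a sketch, under those assumptions, of building an explicit chunk_options dict from the config instead of relying on the in-module default:

# Sketch: derive chunk options from the YAML (paths and key choices follow the assumptions above).
import yaml

with open('Config_Files/mediawiki_import_config.yaml', 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

chunk_options = dict(cfg['chunking'])
chunk_options['max_size'] = cfg['ui']['max_chunk_size']  # e.g. 2000 characters per chunk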