# src/create_database.py
"""Parse data-update.txt into sections, attach keywords, and load them into the agent's memory database."""
import logging
import os
import re
from typing import Dict, List

from src.memory import MemoryManager  # Corrected import path

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
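
# Assumed MemoryManager interface (its implementation is not shown here): the
# code below relies on section_exists(concept) and
# add_semantic_memory(concept, description, tags=...).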
def parse_data_update(file_path: str) -> List[Dict[str, str]]:
    """Split the data-update file into sections keyed by their heading line."""
    if not os.path.exists(file_path):
        logging.error(f"File not found: {file_path}")
        return []
    with open(file_path, 'r') as file_obj:
        content = file_obj.read()
    content = content.lower()  # Normalize to lowercase
    sections = []
    lines = content.split('\n')
    current_section = None
    current_content = []
    for line in lines:
        stripped = line.strip()
        # A new section starts at any recognized heading line.
        if stripped.startswith(("chronique #", "flash info fl-", "chronique-faq #")):
            if current_section:
                sections.append({
                    "concept": current_section,
                    "description": "\n".join(current_content)
                })
                logging.info(f"Parsed section: {current_section}")
            current_section = stripped
            current_content = []
        else:
            current_content.append(line)
    # Flush the final section once the loop ends.
    if current_section:
        sections.append({
            "concept": current_section,
            "description": "\n".join(current_content)
        })
        logging.info(f"Parsed section: {current_section}")
    return sections
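
# For reference, parse_data_update expects data-update.txt to interleave
# heading lines and body text roughly like this (illustrative sketch only; the
# titles and numbers are hypothetical, and matching is case-insensitive
# because the content is lowercased first):
#
#   CHRONIQUE #12 some title
#   ...body lines for section 12...
#   FLASH INFO FL-034
#   ...body lines for the flash info...
#   CHRONIQUE-FAQ #5
#   ...body lines for the FAQ entry...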
def get_keywords(number: str, keyword_dir: str) -> List[str]:
    """Find the keyword file matching a section number, trying each naming scheme in turn."""
    candidates = [
        os.path.join(keyword_dir, f"FL-{number}-KEYWORD.txt"),
        os.path.join(keyword_dir, f"INFO-{number}-KEYWORD.txt"),
        os.path.join(keyword_dir, f"CHRONIQUE{number}-KEYWORD.txt"),
    ]
    keyword_file = next((path for path in candidates if os.path.exists(path)), None)
    if keyword_file is None:
        logging.warning(f"No keyword file found for section {number} in {keyword_dir}")
        return []
    with open(keyword_file, 'r') as file_obj:
        content = file_obj.read()
    # Keyword files may prefix the list with "KEYWORD = "; strip that prefix if present.
    if 'KEYWORD = ' in content:
        content = content.split('KEYWORD = ')[1]
    tags = [tag.strip() for tag in content.split(', ') if tag.strip()]  # Remove empty tags
    logging.info(f"Keywords for {number}: {tags}")
    return tags
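
# A keyword file is expected to hold one comma-separated list, optionally
# prefixed with "KEYWORD = " (hypothetical contents, inferred from the parsing
# above):
#
#   KEYWORD = tag-one, tag-two, tag-three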
def load_and_process_dataset(data_update_path: str, keyword_dir: str, db_path: str):
    """Parse the data-update file and store each new section in the memory database."""
    memory_manager = MemoryManager(db_path)
    sections = parse_data_update(data_update_path)
    for section in sections:
        concept = section['concept']
        description = section['description']
        # Extract the section number from the heading. "chronique #12" carries
        # it after '#', but "flash info fl-034" has no '#', so take the first
        # run of digits instead of splitting on '#' (which raised IndexError
        # for flash info headings).
        match = re.search(r'\d+', concept)
        if not match:
            logging.warning(f"No section number found in concept: {concept}")
            continue
        number = match.group()
        tags = get_keywords(number, keyword_dir)
        # Skip sections that are already in the database
        if not memory_manager.section_exists(concept):
            memory_manager.add_semantic_memory(concept, description, tags=tags)
            logging.info(f"Added section: {concept}")
        else:
            logging.info(f"Section already exists: {concept}")
if __name__ == "__main__":
    data_update_path = "data-update.txt"
    keyword_dir = "keyword"  # Updated keyword directory
    db_path = "agent.db"
    load_and_process_dataset(data_update_path, keyword_dir, db_path)
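
# Usage sketch: because of the src.memory import, this script is assumed to be
# run as a module from the repository root, e.g.
#
#   python -m src.create_database
#
# with data-update.txt, the keyword/ directory, and agent.db resolved relative
# to the working directory.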