File size: 5,114 Bytes
0d0eac6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import os
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import random
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import fitz # PyMuPDF
from dotenv import load_dotenv
from huggingface_hub import login, HfApi
import traceback
# Load secrets from .env, then authenticate with the Hugging Face Hub so the
# model upload below is authorized (HF_TOKEN must be set in the environment).
load_dotenv()
login(token=os.getenv("HF_TOKEN"), add_to_git_credential=True)
# Step 1: Extract Text from PDFs
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path to a PDF readable by PyMuPDF (fitz).

    Returns:
        A single string containing the text of all pages, in page order.
    """
    # FIX: close the document deterministically (fitz.Document supports the
    # context-manager protocol) instead of leaking the handle, and build the
    # result with join instead of quadratic string `+=`.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
def chunk_text(text, themes):
    """Split *text* into overlapping chunks and bucket them by theme.

    Each chunk is assigned to the first theme whose name occurs
    (case-insensitively) inside it; chunks that match no theme land in
    the "Unclassified" bucket.

    Args:
        text: Full document text to split.
        themes: Ordered list of theme names to match against.

    Returns:
        Dict mapping every theme (plus "Unclassified") to its chunk list.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    thematic_chunks = {theme: [] for theme in themes}
    thematic_chunks["Unclassified"] = []  # catch-all for unmatched chunks

    for piece in splitter.split_text(text):
        lowered = piece.lower()
        # First theme whose name appears in the chunk wins; otherwise
        # fall through to the catch-all bucket.
        bucket = next(
            (theme for theme in themes if theme.lower() in lowered),
            "Unclassified",
        )
        thematic_chunks[bucket].append(piece)

    print("Chunks per theme:")
    for theme, theme_chunks in thematic_chunks.items():
        print(f" {theme}: {len(theme_chunks)}")
    return thematic_chunks
# NOTE: chunk_text mirrors the implementation in app.py — keep the two in sync.
# Function to generate synthetic fine-tuning data
def generate_synthetic_data(thematic_chunks, n_samples=1000):
    """Build synthetic (question, chunk) training pairs for fine-tuning.

    For each non-empty theme, draws roughly ``n_samples / num_themes``
    chunks at random (with replacement) and pairs each with a templated
    question mentioning the theme.

    Args:
        thematic_chunks: Mapping of theme name -> list of text chunks.
        n_samples: Approximate total number of examples to generate.

    Returns:
        List of sentence_transformers.InputExample (question, chunk) pairs.
    """
    examples = []
    print(f"Total themes: {len(thematic_chunks)}")
    # FIX: the per-theme quota depends only on the number of themes, so
    # compute it once instead of on every loop iteration. Guard against an
    # empty mapping so the hoist cannot introduce a ZeroDivisionError
    # (the original never divided when the loop body did not run).
    samples_per_theme = (
        max(1, n_samples // len(thematic_chunks)) if thematic_chunks else 0
    )
    for theme, chunks in thematic_chunks.items():
        print(f"Theme: {theme}, Number of chunks: {len(chunks)}")
        if not chunks:
            print(f"Warning: No chunks for theme '{theme}'. Skipping this theme.")
            continue
        for _ in range(samples_per_theme):
            chunk = random.choice(chunks)
            question = f"What does this text say about {theme.lower()}?"
            examples.append(InputExample(texts=[question, chunk]))
    print(f"Total examples generated: {len(examples)}")
    return examples
# Function to fine-tune the model
def fine_tune_model(model, train_examples, output_path):
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=3, warmup_steps=100, output_path=output_path)
return model
def main():
    """Extract themed chunks from every PDF in ``resources/`` and fine-tune
    the base embedding model on synthetic (question, chunk) pairs.

    Side effects: reads ``resources/*.pdf`` and writes the fine-tuned model
    to ``fine_tuned_embedding_model/``.
    """
    resources_folder = "resources"
    themes = [
        "Safe and Effective Systems",
        "Algorithmic Discrimination Protections",
        "Data Privacy",
        "Notice and Explanation",
        "Human Alternatives",
        "Risk Management",
        "Governance",
        "Trustworthiness",
        "Unclassified"
    ]
    all_thematic_chunks = {}
    for filename in os.listdir(resources_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(resources_folder, filename)
            text = extract_text_from_pdf(pdf_path)
            thematic_chunks = chunk_text(text, themes)
            # BUG FIX: dict.update() replaced each theme's list with the
            # current PDF's chunks, silently discarding every earlier PDF.
            # Extend per-theme lists so chunks accumulate across all PDFs.
            for theme, chunks in thematic_chunks.items():
                all_thematic_chunks.setdefault(theme, []).extend(chunks)
            # BUG FIX: the f-string had no placeholder; report the file name.
            print(f"Processed {filename}")
    # Fine-tune the model on the accumulated chunks.
    base_model = "sentence-transformers/all-MiniLM-L6-v2"
    model = SentenceTransformer(base_model)
    train_examples = generate_synthetic_data(all_thematic_chunks)
    fine_tuned_model_path = "fine_tuned_embedding_model"
    fine_tune_model(model, train_examples, fine_tuned_model_path)
    print("Fine-tuning completed. Model saved locally.")
def upload_model_to_hub():
    """Upload the locally saved fine-tuned model to the Hugging Face Hub.

    Walks ``fine_tuned_embedding_model/`` and uploads every file to the
    ``svb01/fine-tuned-embedding-model`` repository, preserving the
    directory layout. Errors are logged with a full traceback rather than
    propagated (best-effort upload).
    """
    try:
        fine_tuned_model_path = "fine_tuned_embedding_model"
        # Loading first acts as a sanity check that the saved model is valid.
        model = SentenceTransformer(fine_tuned_model_path)
        repo_id = "svb01/fine-tuned-embedding-model"
        print(f"Uploading model to existing repository: {repo_id}")
        api = HfApi()
        for root, _, files in os.walk(fine_tuned_model_path):
            for file in files:
                file_path = os.path.join(root, file)
                # BUG FIX: path_in_repo=file flattened the directory tree, so
                # files in subfolders (e.g. 1_Pooling/config.json) collided or
                # landed at the repo root. Keep the model-relative path, with
                # forward slashes as the Hub expects.
                rel_path = os.path.relpath(file_path, fine_tuned_model_path)
                api.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=rel_path.replace(os.sep, "/"),
                    repo_id=repo_id,
                    commit_message=f"Upload {file}"
                )
        print("Fine-tuned model uploaded to Hugging Face Hub.")
    except Exception as e:
        print(f"Error uploading model to Hugging Face Hub: {str(e)}")
        print("Detailed error information:")
        print(traceback.format_exc())
if __name__ == "__main__":
    # Uncomment exactly one entry point: main() fine-tunes and saves the
    # model locally; upload_model_to_hub() pushes the saved model to the Hub.
    # main() # Run this for fine-tuning
    upload_model_to_hub() # Run this to upload the model
|