{ "cells": [ { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# !pip install nest_asyncio \\\n", "# langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters \\\n", "# python-pptx==1.0.2 nltk==3.9.1 pymupdf lxml \\\n", "# sentence-transformers IProgress \\\n", "# huggingface_hub ipywidgets \\\n", "# qdrant-client langchain_experimental\n", "\n", "# !pip install sentence_transformers datasets pyarrow\n", "# !pip install torch\n", "# !pip install \"accelerate>=0.26.0\"\n", "# !pip install transformers\n", "# !pip install wandb\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "\n", "import nest_asyncio\n", "\n", "nest_asyncio.apply()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#!pip install -qU langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "#!pip install -qU faiss-cpu python-pptx==1.0.2 nltk==3.9.1 pymupdf beautifulsoup4 lxml" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#!pip install -qU sentence-transformers\n", "#!pip install -qU IProgress\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import os\n", "import getpass\n", "\n", "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key: \")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "hf_username = getpass.getpass(\"Enter Your Hugging Face Username: \")\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "df7fbe16b4c44797abc886b87583af59", "version_major": 2, "version_minor": 0 }, "text/plain": [ 
"VBox(children=(HTML(value='
Display W&B run" ], "text/plain": [ "" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#!pip install wandb\n", "\n", "import wandb\n", "wandb.init(mode=\"disabled\")" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# !pip install torch\n", "# !pip install \"accelerate>=0.26.0\"\n", "# !pip install transformers\n", "\n" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "#!pip install --upgrade --force-reinstall transformers accelerate torch\n", "#!which python\n", "\n" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [5/5 00:01, Epoch 5/5]\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StepTraining LossValidation LossCosine Accuracy@1Cosine Accuracy@3Cosine Accuracy@5Cosine Accuracy@10Cosine Precision@1Cosine Precision@3Cosine Precision@5Cosine Precision@10Cosine Recall@1Cosine Recall@3Cosine Recall@5Cosine Recall@10Cosine Ndcg@10Cosine Mrr@10Cosine Map@100
1No logNo log1.0000001.0000001.0000001.0000001.0000000.3333330.2000000.1000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
2No logNo log1.0000001.0000001.0000001.0000001.0000000.3333330.2000000.1000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
3No logNo log1.0000001.0000001.0000001.0000001.0000000.3333330.2000000.1000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
4No logNo log1.0000001.0000001.0000001.0000001.0000000.3333330.2000000.1000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
5No logNo log1.0000001.0000001.0000001.0000001.0000000.3333330.2000000.1000001.0000001.0000001.0000001.0000001.0000001.0000001.000000

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "warmup_steps = int(len(loader) * EPOCHS * 0.1)\n", "\n", "model.fit(\n", " train_objectives=[(loader, train_loss)],\n", " epochs=EPOCHS,\n", " warmup_steps=warmup_steps,\n", " output_path='models/midterm-compare-arctic-embed-m-ft',\n", " show_progress_bar=True,\n", " evaluator=evaluator,\n", " evaluation_steps=50\n", ")" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c3832f15349447c59ef0b7950d732a59", "version_major": 2, "version_minor": 0 }, "text/plain": [ "model.safetensors: 0%| | 0.00/436M [00:00