drewgenai committed on
Commit
c25f92a
·
1 Parent(s): d35bb79
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
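Editor's note: the entries above route matching files through Git LFS (filter=lfs diff=lfs merge=lfs) and mark them as non-text so Git stores pointers instead of the binaries. As a rough illustration only, the Python sketch below approximates that routing with shell-style globs; the pattern subset and the basename-only fnmatch semantics are assumptions, since Git's own attribute matching (for example saved_model/**/*) differs in detail.

import fnmatch

# Subset of the LFS patterns from the .gitattributes entries above
# (assumption: basename matching only, not full gitattributes semantics).
lfs_patterns = ["*.bin", "*.h5", "*.parquet", "*.pt", "*.safetensors", "*tfevents*"]

def routed_to_lfs(path: str) -> bool:
    """Return True if the file name would match one of the LFS glob patterns."""
    name = path.rsplit("/", 1)[-1]
    return any(fnmatch.fnmatch(name, pattern) for pattern in lfs_patterns)

print(routed_to_lfs("models/model.safetensors"))  # True
print(routed_to_lfs("README.md"))                 # False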
.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ .chainlit/
3
+ .venv/
4
+ .env
5
+ /output/
6
+ /upload/
7
+ *.jsonl
8
+ /models/
9
+ *z*.py
01-cleanragcsv.ipynb ADDED
@@ -0,0 +1,686 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Requirement already satisfied: nest_asyncio in ./.venv/lib/python3.13/site-packages (1.6.0)\n",
13
+ "Requirement already satisfied: langchain_openai in ./.venv/lib/python3.13/site-packages (0.3.6)\n",
14
+ "Requirement already satisfied: langchain_huggingface in ./.venv/lib/python3.13/site-packages (0.1.2)\n",
15
+ "Requirement already satisfied: langchain_core in ./.venv/lib/python3.13/site-packages (0.3.37)\n",
16
+ "Requirement already satisfied: langchain in ./.venv/lib/python3.13/site-packages (0.3.19)\n",
17
+ "Requirement already satisfied: langchain_community in ./.venv/lib/python3.13/site-packages (0.3.18)\n",
18
+ "Requirement already satisfied: langchain-text-splitters in ./.venv/lib/python3.13/site-packages (0.3.6)\n",
19
+ "Requirement already satisfied: faiss-cpu in ./.venv/lib/python3.13/site-packages (1.10.0)\n",
20
+ "Requirement already satisfied: python-pptx==1.0.2 in ./.venv/lib/python3.13/site-packages (1.0.2)\n",
21
+ "Requirement already satisfied: nltk==3.9.1 in ./.venv/lib/python3.13/site-packages (3.9.1)\n",
22
+ "Requirement already satisfied: pymupdf in ./.venv/lib/python3.13/site-packages (1.25.3)\n",
23
+ "Requirement already satisfied: beautifulsoup4 in ./.venv/lib/python3.13/site-packages (4.13.3)\n",
24
+ "Requirement already satisfied: lxml in ./.venv/lib/python3.13/site-packages (5.3.1)\n",
25
+ "Requirement already satisfied: sentence-transformers in ./.venv/lib/python3.13/site-packages (3.4.1)\n",
26
+ "Requirement already satisfied: IProgress in ./.venv/lib/python3.13/site-packages (0.4)\n",
27
+ "Requirement already satisfied: huggingface_hub in ./.venv/lib/python3.13/site-packages (0.29.1)\n",
28
+ "Requirement already satisfied: ipywidgets in ./.venv/lib/python3.13/site-packages (8.1.5)\n",
29
+ "Requirement already satisfied: qdrant-client in ./.venv/lib/python3.13/site-packages (1.13.2)\n",
30
+ "Requirement already satisfied: Pillow>=3.3.2 in ./.venv/lib/python3.13/site-packages (from python-pptx==1.0.2) (11.1.0)\n",
31
+ "Requirement already satisfied: XlsxWriter>=0.5.7 in ./.venv/lib/python3.13/site-packages (from python-pptx==1.0.2) (3.2.2)\n",
32
+ "Requirement already satisfied: typing-extensions>=4.9.0 in ./.venv/lib/python3.13/site-packages (from python-pptx==1.0.2) (4.12.2)\n",
33
+ "Requirement already satisfied: click in ./.venv/lib/python3.13/site-packages (from nltk==3.9.1) (8.1.8)\n",
34
+ "Requirement already satisfied: joblib in ./.venv/lib/python3.13/site-packages (from nltk==3.9.1) (1.4.2)\n",
35
+ "Requirement already satisfied: regex>=2021.8.3 in ./.venv/lib/python3.13/site-packages (from nltk==3.9.1) (2024.11.6)\n",
36
+ "Requirement already satisfied: tqdm in ./.venv/lib/python3.13/site-packages (from nltk==3.9.1) (4.67.1)\n",
37
+ "Requirement already satisfied: openai<2.0.0,>=1.58.1 in ./.venv/lib/python3.13/site-packages (from langchain_openai) (1.63.2)\n",
38
+ "Requirement already satisfied: tiktoken<1,>=0.7 in ./.venv/lib/python3.13/site-packages (from langchain_openai) (0.9.0)\n",
39
+ "Requirement already satisfied: tokenizers>=0.19.1 in ./.venv/lib/python3.13/site-packages (from langchain_huggingface) (0.21.0)\n",
40
+ "Requirement already satisfied: transformers>=4.39.0 in ./.venv/lib/python3.13/site-packages (from langchain_huggingface) (4.49.0)\n",
41
+ "Requirement already satisfied: langsmith<0.4,>=0.1.125 in ./.venv/lib/python3.13/site-packages (from langchain_core) (0.3.10)\n",
42
+ "Requirement already satisfied: tenacity!=8.4.0,<10.0.0,>=8.1.0 in ./.venv/lib/python3.13/site-packages (from langchain_core) (9.0.0)\n",
43
+ "Requirement already satisfied: jsonpatch<2.0,>=1.33 in ./.venv/lib/python3.13/site-packages (from langchain_core) (1.33)\n",
44
+ "Requirement already satisfied: PyYAML>=5.3 in ./.venv/lib/python3.13/site-packages (from langchain_core) (6.0.2)\n",
45
+ "Requirement already satisfied: packaging<25,>=23.2 in ./.venv/lib/python3.13/site-packages (from langchain_core) (24.2)\n",
46
+ "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in ./.venv/lib/python3.13/site-packages (from langchain_core) (2.10.6)\n",
47
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in ./.venv/lib/python3.13/site-packages (from langchain) (2.0.38)\n",
48
+ "Requirement already satisfied: requests<3,>=2 in ./.venv/lib/python3.13/site-packages (from langchain) (2.32.3)\n",
49
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in ./.venv/lib/python3.13/site-packages (from langchain) (3.11.12)\n",
50
+ "Requirement already satisfied: numpy<3,>=1.26.2 in ./.venv/lib/python3.13/site-packages (from langchain) (2.2.3)\n",
51
+ "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in ./.venv/lib/python3.13/site-packages (from langchain_community) (0.6.7)\n",
52
+ "Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in ./.venv/lib/python3.13/site-packages (from langchain_community) (2.8.0)\n",
53
+ "Requirement already satisfied: httpx-sse<1.0.0,>=0.4.0 in ./.venv/lib/python3.13/site-packages (from langchain_community) (0.4.0)\n",
54
+ "Requirement already satisfied: soupsieve>1.2 in ./.venv/lib/python3.13/site-packages (from beautifulsoup4) (2.6)\n",
55
+ "Requirement already satisfied: torch>=1.11.0 in ./.venv/lib/python3.13/site-packages (from sentence-transformers) (2.6.0)\n",
56
+ "Requirement already satisfied: scikit-learn in ./.venv/lib/python3.13/site-packages (from sentence-transformers) (1.6.1)\n",
57
+ "Requirement already satisfied: scipy in ./.venv/lib/python3.13/site-packages (from sentence-transformers) (1.15.2)\n",
58
+ "Requirement already satisfied: six in ./.venv/lib/python3.13/site-packages (from IProgress) (1.17.0)\n",
59
+ "Requirement already satisfied: filelock in ./.venv/lib/python3.13/site-packages (from huggingface_hub) (3.17.0)\n",
60
+ "Requirement already satisfied: fsspec>=2023.5.0 in ./.venv/lib/python3.13/site-packages (from huggingface_hub) (2024.12.0)\n",
61
+ "Requirement already satisfied: comm>=0.1.3 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (0.2.2)\n",
62
+ "Requirement already satisfied: ipython>=6.1.0 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (8.32.0)\n",
63
+ "Requirement already satisfied: traitlets>=4.3.1 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (5.14.3)\n",
64
+ "Requirement already satisfied: widgetsnbextension~=4.0.12 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (4.0.13)\n",
65
+ "Requirement already satisfied: jupyterlab-widgets~=3.0.12 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (3.0.13)\n",
66
+ "Requirement already satisfied: grpcio>=1.41.0 in ./.venv/lib/python3.13/site-packages (from qdrant-client) (1.70.0)\n",
67
+ "Requirement already satisfied: grpcio-tools>=1.41.0 in ./.venv/lib/python3.13/site-packages (from qdrant-client) (1.70.0)\n",
68
+ "Requirement already satisfied: httpx>=0.20.0 in ./.venv/lib/python3.13/site-packages (from httpx[http2]>=0.20.0->qdrant-client) (0.28.1)\n",
69
+ "Requirement already satisfied: portalocker<3.0.0,>=2.7.0 in ./.venv/lib/python3.13/site-packages (from qdrant-client) (2.10.1)\n",
70
+ "Requirement already satisfied: urllib3<3,>=1.26.14 in ./.venv/lib/python3.13/site-packages (from qdrant-client) (2.3.0)\n",
71
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (2.4.6)\n",
72
+ "Requirement already satisfied: aiosignal>=1.1.2 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.2)\n",
73
+ "Requirement already satisfied: attrs>=17.3.0 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (25.1.0)\n",
74
+ "Requirement already satisfied: frozenlist>=1.1.1 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.5.0)\n",
75
+ "Requirement already satisfied: multidict<7.0,>=4.5 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.1.0)\n",
76
+ "Requirement already satisfied: propcache>=0.2.0 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (0.3.0)\n",
77
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.18.3)\n",
78
+ "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in ./.venv/lib/python3.13/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community) (3.26.1)\n",
79
+ "Requirement already satisfied: typing-inspect<1,>=0.4.0 in ./.venv/lib/python3.13/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community) (0.9.0)\n",
80
+ "Requirement already satisfied: protobuf<6.0dev,>=5.26.1 in ./.venv/lib/python3.13/site-packages (from grpcio-tools>=1.41.0->qdrant-client) (5.29.3)\n",
81
+ "Requirement already satisfied: setuptools in ./.venv/lib/python3.13/site-packages (from grpcio-tools>=1.41.0->qdrant-client) (75.8.0)\n",
82
+ "Requirement already satisfied: anyio in ./.venv/lib/python3.13/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (4.8.0)\n",
83
+ "Requirement already satisfied: certifi in ./.venv/lib/python3.13/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (2025.1.31)\n",
84
+ "Requirement already satisfied: httpcore==1.* in ./.venv/lib/python3.13/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (1.0.7)\n",
85
+ "Requirement already satisfied: idna in ./.venv/lib/python3.13/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (3.10)\n",
86
+ "Requirement already satisfied: h11<0.15,>=0.13 in ./.venv/lib/python3.13/site-packages (from httpcore==1.*->httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (0.14.0)\n",
87
+ "Requirement already satisfied: h2<5,>=3 in ./.venv/lib/python3.13/site-packages (from httpx[http2]>=0.20.0->qdrant-client) (4.2.0)\n",
88
+ "Requirement already satisfied: decorator in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (5.2.1)\n",
89
+ "Requirement already satisfied: jedi>=0.16 in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (0.19.2)\n",
90
+ "Requirement already satisfied: matplotlib-inline in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.7)\n",
91
+ "Requirement already satisfied: pexpect>4.3 in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (4.9.0)\n",
92
+ "Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.50)\n",
93
+ "Requirement already satisfied: pygments>=2.4.0 in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (2.19.1)\n",
94
+ "Requirement already satisfied: stack_data in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n",
95
+ "Requirement already satisfied: jsonpointer>=1.9 in ./.venv/lib/python3.13/site-packages (from jsonpatch<2.0,>=1.33->langchain_core) (3.0.0)\n",
96
+ "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in ./.venv/lib/python3.13/site-packages (from langsmith<0.4,>=0.1.125->langchain_core) (3.10.15)\n",
97
+ "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in ./.venv/lib/python3.13/site-packages (from langsmith<0.4,>=0.1.125->langchain_core) (1.0.0)\n",
98
+ "Requirement already satisfied: zstandard<0.24.0,>=0.23.0 in ./.venv/lib/python3.13/site-packages (from langsmith<0.4,>=0.1.125->langchain_core) (0.23.0)\n",
99
+ "Requirement already satisfied: distro<2,>=1.7.0 in ./.venv/lib/python3.13/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.9.0)\n",
100
+ "Requirement already satisfied: jiter<1,>=0.4.0 in ./.venv/lib/python3.13/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (0.8.2)\n",
101
+ "Requirement already satisfied: sniffio in ./.venv/lib/python3.13/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.3.1)\n",
102
+ "Requirement already satisfied: annotated-types>=0.6.0 in ./.venv/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.7.4->langchain_core) (0.7.0)\n",
103
+ "Requirement already satisfied: pydantic-core==2.27.2 in ./.venv/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.7.4->langchain_core) (2.27.2)\n",
104
+ "Requirement already satisfied: python-dotenv>=0.21.0 in ./.venv/lib/python3.13/site-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain_community) (1.0.1)\n",
105
+ "Requirement already satisfied: charset-normalizer<4,>=2 in ./.venv/lib/python3.13/site-packages (from requests<3,>=2->langchain) (3.4.1)\n",
106
+ "Requirement already satisfied: greenlet!=0.4.17 in ./.venv/lib/python3.13/site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.1.1)\n",
107
+ "Requirement already satisfied: networkx in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (3.4.2)\n",
108
+ "Requirement already satisfied: jinja2 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.5)\n",
109
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
110
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
111
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
112
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (9.1.0.70)\n",
113
+ "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.5.8)\n",
114
+ "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (11.2.1.3)\n",
115
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (10.3.5.147)\n",
116
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (11.6.1.9)\n",
117
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.3.1.170)\n",
118
+ "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (0.6.2)\n",
119
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (2.21.5)\n",
120
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
121
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
122
+ "Requirement already satisfied: triton==3.2.0 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (3.2.0)\n",
123
+ "Requirement already satisfied: sympy==1.13.1 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (1.13.1)\n",
124
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in ./.venv/lib/python3.13/site-packages (from sympy==1.13.1->torch>=1.11.0->sentence-transformers) (1.3.0)\n",
125
+ "Requirement already satisfied: safetensors>=0.4.1 in ./.venv/lib/python3.13/site-packages (from transformers>=4.39.0->langchain_huggingface) (0.5.2)\n",
126
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in ./.venv/lib/python3.13/site-packages (from scikit-learn->sentence-transformers) (3.5.0)\n",
127
+ "Requirement already satisfied: hyperframe<7,>=6.1 in ./.venv/lib/python3.13/site-packages (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client) (6.1.0)\n",
128
+ "Requirement already satisfied: hpack<5,>=4.1 in ./.venv/lib/python3.13/site-packages (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client) (4.1.0)\n",
129
+ "Requirement already satisfied: parso<0.9.0,>=0.8.4 in ./.venv/lib/python3.13/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.4)\n",
130
+ "Requirement already satisfied: ptyprocess>=0.5 in ./.venv/lib/python3.13/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)\n",
131
+ "Requirement already satisfied: wcwidth in ./.venv/lib/python3.13/site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)\n",
132
+ "Requirement already satisfied: mypy-extensions>=0.3.0 in ./.venv/lib/python3.13/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community) (1.0.0)\n",
133
+ "Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib/python3.13/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (3.0.2)\n",
134
+ "Requirement already satisfied: executing>=1.2.0 in ./.venv/lib/python3.13/site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (2.2.0)\n",
135
+ "Requirement already satisfied: asttokens>=2.1.0 in ./.venv/lib/python3.13/site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (3.0.0)\n",
136
+ "Requirement already satisfied: pure-eval in ./.venv/lib/python3.13/site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (0.2.3)\n"
137
+ ]
138
+ }
139
+ ],
140
+ "source": [
141
+ "# !pip install nest_asyncio \\\n",
142
+ "# langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters \\\n",
143
+ "# python-pptx==1.0.2 nltk==3.9.1 pymupdf lxml \\\n",
144
+ "# sentence-transformers IProgress \\\n",
145
+ "# huggingface_hub ipywidgets \\\n",
146
+ "# qdrant-client"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 1,
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "\n",
156
+ "import nest_asyncio\n",
157
+ "\n",
158
+ "nest_asyncio.apply()"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 2,
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "import os\n",
168
+ "import getpass\n",
169
+ "\n",
170
+ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key: \")"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 3,
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": [
179
+ "hf_username = getpass.getpass(\"Enter Your Hugging Face Username: \")\n"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 4,
185
+ "metadata": {},
186
+ "outputs": [
187
+ {
188
+ "data": {
189
+ "application/vnd.jupyter.widget-view+json": {
190
+ "model_id": "a5c203d394cb4c1d933c1af73ff1c112",
191
+ "version_major": 2,
192
+ "version_minor": 0
193
+ },
194
+ "text/plain": [
195
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
196
+ ]
197
+ },
198
+ "metadata": {},
199
+ "output_type": "display_data"
200
+ }
201
+ ],
202
+ "source": [
203
+ "from huggingface_hub import notebook_login\n",
204
+ "notebook_login()"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 5,
210
+ "metadata": {},
211
+ "outputs": [
212
+ {
213
+ "name": "stdout",
214
+ "output_type": "stream",
215
+ "text": [
216
+ "{'type': 'user', 'id': '67624d1b57e77fe6e0c87ae5', 'name': 'drewgenai', 'fullname': 'Drew DeMarco', 'email': '[email protected]', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/L6eLaZmCK4jqW3ZTLYIAR.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'newotken', 'role': 'write', 'createdAt': '2025-02-12T04:11:04.130Z'}}}\n"
217
+ ]
218
+ }
219
+ ],
220
+ "source": [
221
+ "from huggingface_hub import whoami\n",
222
+ "print(whoami())\n"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 6,
228
+ "metadata": {},
229
+ "outputs": [
230
+ {
231
+ "name": "stdout",
232
+ "output_type": "stream",
233
+ "text": [
234
+ "mkdir: cannot create directory ‘example_files’: File exists\n",
235
+ "mkdir: cannot create directory ‘output’: File exists\n"
236
+ ]
237
+ }
238
+ ],
239
+ "source": [
240
+ "!mkdir example_files\n",
241
+ "!mkdir output"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 7,
247
+ "metadata": {},
248
+ "outputs": [],
249
+ "source": [
250
+ "from langchain_community.document_loaders import DirectoryLoader\n",
251
+ "from langchain_community.document_loaders import PyMuPDFLoader\n",
252
+ "\n",
253
+ "path = \"example_files/\"\n",
254
+ "text_loader = DirectoryLoader(path, glob=\"*.pdf\", loader_cls=PyMuPDFLoader)"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "markdown",
259
+ "metadata": {},
260
+ "source": [
261
+ "1️⃣ Header-Based Chunking (Title-Based Splitter)\n",
262
+ "Uses document structure to split on headings, section titles, or patterns.\n",
263
+ "Works well for structured documents with named assessments, numbered lists, or headers.\n",
264
+ "Example: If it detects Chronic Pain Adjustment Index (CPAI-10), it groups everything under that title.\n",
265
+ "2️⃣ Semantic Chunking (Text-Meaning Splitter)\n",
266
+ "Uses embeddings or sentence similarity to decide where to break chunks.\n",
267
+ "Prevents splitting mid-context if sentences are closely related.\n",
268
+ "Example: Groups all related pain-assessment questions into one chunk."
269
+ ]
270
+ },
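Editor's note: the cell above describes two chunking strategies, but the notebook only implements the semantic variant (SemanticChunker, further down). A minimal, hypothetical sketch of the header-based approach follows; the heading regex and the assumed title format such as "Chronic Pain Adjustment Index (CPAI-10)" are assumptions about the example PDFs, not code from this commit.

import re

# Assumed heading shape: a title ending in an abbreviation like (CPAI-10), alone on its line.
HEADING_RE = re.compile(r"^(?P<title>[A-Z][\w\s-]+\([A-Z]{2,}-\d+\))\s*$", re.MULTILINE)

def split_by_headings(text: str):
    """Split raw page text into (title, body) chunks, one per detected heading."""
    matches = list(HEADING_RE.finditer(text))
    chunks = []
    for i, match in enumerate(matches):
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chunks.append((match.group("title"), text[start:end].strip()))
    return chunks

Each chunk keeps its section title, so it could be wrapped in a Document with the title in metadata, much as the semantic chunks are handled in the cells below.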
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 8,
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "# from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
278
+ "\n",
279
+ "\n",
280
+ "# text_splitter = RecursiveCharacterTextSplitter(\n",
281
+ "# chunk_size = 200,\n",
282
+ "# chunk_overlap = 20,\n",
283
+ "# length_function = len\n",
284
+ "# )\n",
285
+ "\n",
286
+ "\n",
287
+ "### potentially use for lenth tokens later"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": null,
293
+ "metadata": {},
294
+ "outputs": [],
295
+ "source": []
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 9,
300
+ "metadata": {},
301
+ "outputs": [],
302
+ "source": [
303
+ "# #Load documents with metadata\n",
304
+ "# all_documents = text_loader.load()\n",
305
+ "# documents_with_metadata = []"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "code",
310
+ "execution_count": 10,
311
+ "metadata": {},
312
+ "outputs": [],
313
+ "source": [
314
+ "# for doc in all_documents:\n",
315
+ "# # Extract document name (assuming PyMuPDFLoader stores the file name in metadata)\n",
316
+ "# source_name = doc.metadata.get(\"source\", \"unknown\")\n",
317
+ " \n",
318
+ "# # Split into chunks while preserving metadata\n",
319
+ "# chunks = text_splitter.split_documents([doc])\n",
320
+ "# for chunk in chunks:\n",
321
+ "# chunk.metadata[\"source\"] = source_name # Attach source info to each chunk\n",
322
+ "# documents_with_metadata.extend(chunks)"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "markdown",
327
+ "metadata": {},
328
+ "source": [
329
+ "###testingbelow\n"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": 11,
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "#!pip install langchain_experimental"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 12,
344
+ "metadata": {},
345
+ "outputs": [
346
+ {
347
+ "name": "stderr",
348
+ "output_type": "stream",
349
+ "text": [
350
+ "/tmp/ipykernel_456462/1110142159.py:7: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
351
+ " embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n"
352
+ ]
353
+ }
354
+ ],
355
+ "source": [
356
+ "from langchain_experimental.text_splitter import SemanticChunker\n",
357
+ "\n",
358
+ "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
359
+ "\n",
360
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
361
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
362
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
363
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\n",
364
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id, model_kwargs={\"trust_remote_code\": True})\n",
365
+ "\n",
366
+ "\n",
367
+ "semantic_splitter = SemanticChunker(embedding_model)\n",
368
+ "\n",
369
+ "all_documents = text_loader.load()\n",
370
+ "documents_with_metadata = []\n",
371
+ "\n"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 13,
377
+ "metadata": {},
378
+ "outputs": [],
379
+ "source": [
380
+ "#verify working\n",
381
+ "# test_doc = all_documents[0].page_content if all_documents else \"\"\n",
382
+ "# test_chunks = semantic_splitter.split_text(test_doc)\n",
383
+ "\n",
384
+ "# print(f\"\\n✅ Total Chunks for First Document: {len(test_chunks)}\")\n",
385
+ "# for i, chunk in enumerate(test_chunks[:3]): # Show first 3 chunks\n",
386
+ "# print(f\"\\n🔹 Chunk {i+1}: {chunk[:300]}\") # Print first 300 characters\n"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": 14,
392
+ "metadata": {},
393
+ "outputs": [],
394
+ "source": [
395
+ "from langchain.schema import Document\n",
396
+ "\n",
397
+ "for doc in all_documents:\n",
398
+ " source_name = doc.metadata.get(\"source\", \"unknown\") # Get document source\n",
399
+ "\n",
400
+ " # Use SemanticChunker to intelligently split text\n",
401
+ " chunks = semantic_splitter.split_text(doc.page_content)\n",
402
+ "\n",
403
+ " # Convert chunks into LangChain Document format with metadata\n",
404
+ " for chunk in chunks:\n",
405
+ " doc_chunk = Document(page_content=chunk, metadata={\"source\": source_name})\n",
406
+ " documents_with_metadata.append(doc_chunk)"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "markdown",
411
+ "metadata": {},
412
+ "source": [
413
+ "###testingabove"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": 15,
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "\n",
423
+ "#!pip install -qU huggingface_hub\n",
424
+ "#!pip install -qU ipywidgets\n"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": 16,
430
+ "metadata": {},
431
+ "outputs": [],
432
+ "source": [
433
+ "from sentence_transformers import SentenceTransformer\n",
434
+ "from langchain.vectorstores import Qdrant\n",
435
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
436
+ "\n",
437
+ "\n",
438
+ "# Load the SentenceTransformer model\n",
439
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
440
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
441
+ "\n",
442
+ "# Load documents into Qdrant\n",
443
+ "qdrant_vectorstore = Qdrant.from_documents(\n",
444
+ " documents_with_metadata,\n",
445
+ " embedding_model,\n",
446
+ " location=\":memory:\", # In-memory for testing\n",
447
+ " collection_name=\"document_comparison\",\n",
448
+ ")\n",
449
+ "\n",
450
+ "# Create a retriever\n",
451
+ "qdrant_retriever = qdrant_vectorstore.as_retriever()"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 63,
457
+ "metadata": {},
458
+ "outputs": [],
459
+ "source": [
460
+ "from langchain_core.prompts import ChatPromptTemplate\n",
461
+ "RAG_PROMPT = \"\"\"\n",
462
+ "CONTEXT:\n",
463
+ "{context}\n",
464
+ "\n",
465
+ "QUERY:\n",
466
+ "{question}\n",
467
+ "\n",
468
+ "You are a helpful assistant. Use the available context to answer the question.\n",
469
+ "\n",
470
+ "Return the response in **valid JSON format** with the following structure:\n",
471
+ "\n",
472
+ "[\n",
473
+ " {{\n",
474
+ " \"Derived Description\": \"A short name for the matched concept\",\n",
475
+ " \"Protocol_1\": \"Protocol 1 - Matching Element\",\n",
476
+ " \"Protocol_2\": \"Protocol 2 - Matching Element\"\n",
477
+ " }},\n",
478
+ " ...\n",
479
+ "]\n",
480
+ "\n",
481
+ "### Rules:\n",
482
+ "1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.\n",
483
+ "2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.\n",
484
+ "3. If no matching element is found in a protocol, leave it empty (\"\").\n",
485
+ "4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.\n",
486
+ "5. It should include all the elements in the two protocols.\n",
487
+ "6. If it cannot match the element, create the row and include the protocol it did find and put \"could not match\" in the other protocol column.\n",
488
+ "7. protocol should be the between\n",
489
+ "\"\"\"\n",
490
+ "\n",
491
+ "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)\n",
492
+ "\n",
493
+ "from langchain_openai import ChatOpenAI\n",
494
+ "\n",
495
+ "#openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
496
+ "openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
497
+ "\n",
498
+ "from operator import itemgetter\n",
499
+ "from langchain.schema.output_parser import StrOutputParser\n",
500
+ "\n",
501
+ "rag_chain = (\n",
502
+ " {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
503
+ " | rag_prompt | openai_chat_model | StrOutputParser()\n",
504
+ ")"
505
+ ]
506
+ },
507
+ {
508
+ "cell_type": "code",
509
+ "execution_count": 64,
510
+ "metadata": {},
511
+ "outputs": [],
512
+ "source": [
513
+ "question_text = \"\"\"You are a helpful assistant. Use the available context to answer the question.\n",
514
+ "\n",
515
+ "Between these two files containing protocols, identify and match **entire assessment sections** based on conceptual similarity. Do NOT match individual questions.\n",
516
+ "\n",
517
+ "### **Output Format:**\n",
518
+ "Return the response in **valid JSON format** structured as a list of dictionaries, where each dictionary contains:\n",
519
+ "\n",
520
+ "[\n",
521
+ " {\n",
522
+ " \"Derived Description\": \"A short name describing the matched sections\",\n",
523
+ " \"Protocol_1\": \"Exact section heading from Protocol 1\",\n",
524
+ " \"Protocol_2\": \"Exact section heading from Protocol 2\"\n",
525
+ " }\n",
526
+ "]\n",
527
+ "\n",
528
+ "### **Matching Criteria:**\n",
529
+ "1. **Match entire assessment sections** based on their purpose and overall topic.\n",
530
+ "3. If a section in one protocol **has no match**, include it but leave the other protocol's field blank.\n",
531
+ "4. The **\"Derived Description\"** should be a **concise label** summarizing the section’s purpose, . It should describe the overall concept of the matched sections.\n",
532
+ "\n",
533
+ "### **Rules:**\n",
534
+ "1. **Only output valid JSON**—no explanations, summaries, or markdown formatting.\n",
535
+ "2. **Ensure each entry represents a single section-to-section match.**\n",
536
+ "4. **Prioritize conceptual similarity over exact wording** when aligning sections.\n",
537
+ "5. If no match is found, leave the unmatched protocol entry blank.\n",
538
+ "\n",
539
+ "### **Example Output:**\n",
540
+ "[\n",
541
+ " {\n",
542
+ " \"Derived Description\": \"Pain Coping Strategies\",\n",
543
+ " \"Protocol_1\": \"Pain Coping Strategy Scale (PCSS-9)\",\n",
544
+ " \"Protocol_2\": \"Chronic Pain Adjustment Index (CPAI-10)\"\n",
545
+ " },\n",
546
+ " {\n",
547
+ " \"Derived Description\": \"Work Stress and Fatigue\",\n",
548
+ " \"Protocol_1\": \"Work-Related Stress Scale (WRSS-8)\",\n",
549
+ " \"Protocol_2\": \"Occupational Fatigue Index (OFI-7)\"\n",
550
+ " },\n",
551
+ "]\n",
552
+ "\n",
553
+ "Do not add any additional text, explanations, or formatting—**only return the raw JSON list**.\n",
554
+ "\"\"\"\n",
555
+ "\n",
556
+ "\n",
557
+ "\n",
558
+ "# The questions within elements will be similar between the two documents and can be used to match the elements.\n",
559
+ "\n",
560
+ "# 1. Derived description from the two documents describing the index/measure/scale.\n",
561
+ "# 2. A column for each standard.\n",
562
+ "# 3. In the column for each name/version, the data element used to capture that description that will be the shortened item between ()\n",
563
+ "\n",
564
+ "# There should only be one row for each scale/index/etc.\n",
565
+ "# The description should not be one of the questions but a name that best describes the similar data elements.\"\"\"\n",
566
+ "\n",
567
+ "response_text = rag_chain.invoke({\"question\": question_text})\n",
568
+ "# response = rag_chain.invoke({\"question\": question_text})"
569
+ ]
570
+ },
571
+ {
572
+ "cell_type": "code",
573
+ "execution_count": 67,
574
+ "metadata": {},
575
+ "outputs": [],
576
+ "source": [
577
+ "import json\n",
578
+ "import pandas as pd\n",
579
+ "\n",
580
+ "def parse_rag_output(response_text):\n",
581
+ " \"\"\"Extract structured JSON data from the RAG response.\"\"\"\n",
582
+ " try:\n",
583
+ " structured_data = json.loads(response_text)\n",
584
+ "\n",
585
+ " # Ensure similarity score is always included\n",
586
+ " for item in structured_data:\n",
587
+ " item.setdefault(\"Similarity Score\", \"N/A\") # Default if missing\n",
588
+ "\n",
589
+ " return structured_data\n",
590
+ " except json.JSONDecodeError:\n",
591
+ " print(\"Error: Response is not valid JSON.\")\n",
592
+ " return None\n",
593
+ "\n",
594
+ "def save_to_csv(data, directory=\"./output\", filename=\"matching_data_elements.csv\"):\n",
595
+ " \"\"\"Save structured data to CSV.\"\"\"\n",
596
+ " if not data:\n",
597
+ " print(\"No data to save.\")\n",
598
+ " return\n",
599
+ "\n",
600
+ " file_path = os.path.join(directory, filename)\n",
601
+ " df = pd.DataFrame(data, columns=[\"Derived Description\", \"Protocol_1\", \"Protocol_2\"]) # Ensure correct columns\n",
602
+ " df.to_csv(file_path, index=False)\n",
603
+ " print(f\"✅ CSV file saved: {filename}\")\n",
604
+ "\n",
605
+ "# Run the pipeline\n",
606
+ "structured_output = parse_rag_output(response_text)\n",
607
+ "save_to_csv(structured_output)\n"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": null,
613
+ "metadata": {},
614
+ "outputs": [],
615
+ "source": []
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": 54,
620
+ "metadata": {},
621
+ "outputs": [
622
+ {
623
+ "data": {
624
+ "text/plain": [
625
+ "'[\\n {\\n \"Derived Description\": \"Memory Recall\",\\n \"Protocol_1_Name\": \"I struggle to remember names and faces. (Scale: 0-3)\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Memory Retention\",\\n \"Protocol_1_Name\": \"I retain new information effectively.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Mnemonic Techniques\",\\n \"Protocol_1_Name\": \"I practice mnemonic techniques to improve recall.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Task Management Difficulty\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I find it difficult to keep track of multiple responsibilities. (Scale: 0-3)\"\\n },\\n {\\n \"Derived Description\": \"Mental Fatigue in Problem-Solving\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I get mentally fatigued quickly when problem-solving. (Scale: 0-3)\"\\n },\\n {\\n \"Derived Description\": \"Task Organization Techniques\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I use structured techniques to organize my tasks. (Scale: 0-3)\"\\n }\\n]'"
626
+ ]
627
+ },
628
+ "execution_count": 54,
629
+ "metadata": {},
630
+ "output_type": "execute_result"
631
+ }
632
+ ],
633
+ "source": [
634
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2\"})"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": 31,
640
+ "metadata": {},
641
+ "outputs": [
642
+ {
643
+ "data": {
644
+ "text/plain": [
645
+ "'[\\n {\\n \"Derived Description\": \"Memory Recall\",\\n \"Protocol_1_Name\": \"I struggle to remember names and faces.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Retaining Information\",\\n \"Protocol_1_Name\": \"I retain new information effectively.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Mnemonic Techniques\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I practice mnemonic techniques to improve recall.\"\\n },\\n {\\n \"Derived Description\": \"Pain Management Preparation\",\\n \"Protocol_1_Name\": \"I mentally prepare myself before engaging in painful activities.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Pain Minimization Techniques\",\\n \"Protocol_1_Name\": \"I use relaxation techniques to minimize pain perception.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Breathing Exercises for Pain\",\\n \"Protocol_1_Name\": \"I use breathing exercises to manage pain episodes.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Avoiding Painful Activities\",\\n \"Protocol_1_Name\": \"I avoid specific physical activities that increase my pain.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Work Exhaustion\",\\n \"Protocol_1_Name\": \"I feel exhausted after a standard workday.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Motivation and Stress\",\\n \"Protocol_1_Name\": \"I struggle to stay motivated due to workplace stress.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Handling Multiple Responsibilities\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I find it difficult to keep track of multiple responsibilities.\"\\n },\\n {\\n \"Derived Description\": \"Mental Fatigue from Problem-Solving\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I get mentally fatigued quickly when problem-solving.\"\\n },\\n {\\n \"Derived Description\": \"Structured Task Organization\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I use structured techniques to organize my tasks.\"\\n },\\n {\\n \"Derived Description\": \"Overwhelmed by Responsibilities\",\\n \"Protocol_1_Name\": \"I feel overwhelmed when handling multiple responsibilities.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Disconnecting from Work\",\\n \"Protocol_1_Name\": \"I find it difficult to disconnect from work-related concerns.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Sleep Disturbances from Work Stress\",\\n \"Protocol_1_Name\": \"I experience sleep disturbances due to work-related stress.\",\\n \"Protocol_2_Name\": \"could not match\"\\n }\\n]'"
646
+ ]
647
+ },
648
+ "execution_count": 31,
649
+ "metadata": {},
650
+ "output_type": "execute_result"
651
+ }
652
+ ],
653
+ "source": [
654
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2. these are example headings not the ones in the actual documents. just list the matches not the rational. Can you list multiple matches?\"})"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": null,
660
+ "metadata": {},
661
+ "outputs": [],
662
+ "source": []
663
+ }
664
+ ],
665
+ "metadata": {
666
+ "kernelspec": {
667
+ "display_name": ".venv",
668
+ "language": "python",
669
+ "name": "python3"
670
+ },
671
+ "language_info": {
672
+ "codemirror_mode": {
673
+ "name": "ipython",
674
+ "version": 3
675
+ },
676
+ "file_extension": ".py",
677
+ "mimetype": "text/x-python",
678
+ "name": "python",
679
+ "nbconvert_exporter": "python",
680
+ "pygments_lexer": "ipython3",
681
+ "version": "3.13.1"
682
+ }
683
+ },
684
+ "nbformat": 4,
685
+ "nbformat_minor": 2
686
+ }
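Editor's note on the notebook above: parse_rag_output calls json.loads directly on the model output, so a response wrapped in Markdown code fences would fail to parse and be dropped. A hedged sketch of a more tolerant parser is below; the function name and the fence-stripping regexes are illustrative assumptions, not part of the commit.

import json
import re

def parse_rag_output_lenient(response_text: str):
    """Parse the model's JSON list, tolerating optional Markdown code fences."""
    cleaned = response_text.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)  # strip a leading fence, if any
    cleaned = re.sub(r"\s*```$", "", cleaned)           # strip a trailing fence, if any
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("Error: Response is not valid JSON.")
        return None

The rest of the pipeline (save_to_csv and the DataFrame columns) would be unchanged; only the parsing step becomes more forgiving.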
02-testembedtune copy.ipynb ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 19,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# !pip install nest_asyncio \\\n",
10
+ "# langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters \\\n",
11
+ "# python-pptx==1.0.2 nltk==3.9.1 pymupdf lxml \\\n",
12
+ "# sentence-transformers IProgress \\\n",
13
+ "# huggingface_hub ipywidgets \\\n",
14
+ "# qdrant-client langchain_experimental\n",
15
+ "\n",
16
+ "# !pip install sentence_transformers datasets pyarrow\n",
17
+ "# !pip install torch\n",
18
+ "# !pip install accelerate>=0.26.0\n",
19
+ "# !pip install transformers\n",
20
+ "# !pip install wandb\n",
21
+ "\n"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 2,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "\n",
31
+ "import nest_asyncio\n",
32
+ "\n",
33
+ "nest_asyncio.apply()"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "#!pip install -qU langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 4,
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": [
51
+ "#!pip install -qU faiss-cpu python-pptx==1.0.2 nltk==3.9.1 pymupdf beautifulsoup4 lxml"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 5,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "#!pip install -qU sentence-transformers\n",
61
+ "#!pip install -qU IProgress\n"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 6,
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "import os\n",
71
+ "import getpass\n",
72
+ "\n",
73
+ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key: \")"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 7,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "hf_username = getpass.getpass(\"Enter Your Hugging Face Username: \")\n"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 8,
88
+ "metadata": {},
89
+ "outputs": [
90
+ {
91
+ "data": {
92
+ "application/vnd.jupyter.widget-view+json": {
93
+ "model_id": "df7fbe16b4c44797abc886b87583af59",
94
+ "version_major": 2,
95
+ "version_minor": 0
96
+ },
97
+ "text/plain": [
98
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
99
+ ]
100
+ },
101
+ "metadata": {},
102
+ "output_type": "display_data"
103
+ }
104
+ ],
105
+ "source": [
106
+ "from huggingface_hub import notebook_login\n",
107
+ "notebook_login()"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 9,
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "name": "stdout",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "{'type': 'user', 'id': '67624d1b57e77fe6e0c87ae5', 'name': 'drewgenai', 'fullname': 'Drew DeMarco', 'email': '[email protected]', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/L6eLaZmCK4jqW3ZTLYIAR.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'newotken', 'role': 'write', 'createdAt': '2025-02-12T04:11:04.130Z'}}}\n"
120
+ ]
121
+ }
122
+ ],
123
+ "source": [
124
+ "from huggingface_hub import whoami\n",
125
+ "print(whoami())\n"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 10,
131
+ "metadata": {},
132
+ "outputs": [
133
+ {
134
+ "name": "stdout",
135
+ "output_type": "stream",
136
+ "text": [
137
+ "mkdir: cannot create directory ‘example_files’: File exists\n",
138
+ "mkdir: cannot create directory ‘output’: File exists\n"
139
+ ]
140
+ }
141
+ ],
142
+ "source": [
143
+ "!mkdir example_files\n",
144
+ "!mkdir output"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 11,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "from langchain_community.document_loaders import DirectoryLoader\n",
154
+ "from langchain_community.document_loaders import PyMuPDFLoader\n",
155
+ "\n",
156
+ "path = \"example_files/\"\n",
157
+ "text_loader = DirectoryLoader(path, glob=\"*.pdf\", loader_cls=PyMuPDFLoader)"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "markdown",
162
+ "metadata": {},
163
+ "source": [
164
+ "1️⃣ Header-Based Chunking (Title-Based Splitter)\n",
165
+ "Uses document structure to split on headings, section titles, or patterns.\n",
166
+ "Works well for structured documents with named assessments, numbered lists, or headers.\n",
167
+ "Example: If it detects Chronic Pain Adjustment Index (CPAI-10), it groups everything under that title.\n",
168
+ "2️⃣ Semantic Chunking (Text-Meaning Splitter)\n",
169
+ "Uses embeddings or sentence similarity to decide where to break chunks.\n",
170
+ "Prevents splitting mid-context if sentences are closely related.\n",
171
+ "Example: Groups all related pain-assessment questions into one chunk."
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": []
180
+ },
181
+ {
182
+ "cell_type": "markdown",
183
+ "metadata": {},
184
+ "source": [
185
+ "###testingbelow\n"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 12,
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "# !pip install langchain_experimental"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 13,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "\n",
204
+ "\n",
205
+ "# #might need to remove all together - don't think it's working\n",
206
+ "# !pip install --upgrade langchain langchain-experimental\n",
207
+ "# !pip install --upgrade langchain-community\n",
208
+ "# !pip install langchain langchain-experimental langchain-community\n",
209
+ "\n"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": 14,
215
+ "metadata": {},
216
+ "outputs": [
217
+ {
218
+ "name": "stderr",
219
+ "output_type": "stream",
220
+ "text": [
221
+ "/tmp/ipykernel_76652/2495904805.py:7: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
222
+ " embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n"
223
+ ]
224
+ }
225
+ ],
226
+ "source": [
227
+ "\n",
228
+ "\n",
229
+ "from langchain_experimental.text_splitter import SemanticChunker\n",
230
+ "\n",
231
+ "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
232
+ "\n",
233
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
234
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
235
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
236
+ "\n",
237
+ "semantic_splitter = SemanticChunker(embedding_model)\n",
238
+ "\n",
239
+ "all_documents = text_loader.load()\n",
240
+ "documents_with_metadata = []\n",
241
+ "\n"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 15,
247
+ "metadata": {},
248
+ "outputs": [],
249
+ "source": [
250
+ "from langchain.schema import Document\n",
251
+ "\n",
252
+ "for doc in all_documents:\n",
253
+ " source_name = doc.metadata.get(\"source\", \"unknown\") # Get document source\n",
254
+ "\n",
255
+ " # Use SemanticChunker to intelligently split text\n",
256
+ " chunks = semantic_splitter.split_text(doc.page_content)\n",
257
+ "\n",
258
+ " # Convert chunks into LangChain Document format with metadata\n",
259
+ " for chunk in chunks:\n",
260
+ " doc_chunk = Document(page_content=chunk, metadata={\"source\": source_name})\n",
261
+ " documents_with_metadata.append(doc_chunk)"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "markdown",
266
+ "metadata": {},
267
+ "source": [
268
+ "##########################new testing below"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 16,
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "#training_documents = text_loader.load()\n",
278
+ "training_documents = documents_with_metadata"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 17,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "import uuid\n",
288
+ "\n",
289
+ "id_set = set()\n",
290
+ "\n",
291
+ "for document in training_documents:\n",
292
+ " id = str(uuid.uuid4())\n",
293
+ " while id in id_set:\n",
294
+ " id = uuid.uuid4()\n",
295
+ " id_set.add(id)\n",
296
+ " document.metadata[\"id\"] = id"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": 18,
302
+ "metadata": {},
303
+ "outputs": [
304
+ {
305
+ "name": "stdout",
306
+ "output_type": "stream",
307
+ "text": [
308
+ "Training set: 4 docs\n",
309
+ "Validation set: 1 docs\n",
310
+ "Test set: 2 docs\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "# Define split percentages\n",
316
+ "train_ratio = 0.7 # 70% training\n",
317
+ "val_ratio = 0.2 # 20% validation\n",
318
+ "test_ratio = 0.1 # 10% test\n",
319
+ "\n",
320
+ "# Calculate index breakpoints\n",
321
+ "total_docs = len(training_documents)\n",
322
+ "train_size = int(total_docs * train_ratio)\n",
323
+ "val_size = int(total_docs * val_ratio)\n",
324
+ "\n",
325
+ "# Perform the splits\n",
326
+ "training_split_documents = training_documents[:train_size]\n",
327
+ "val_split_documents = training_documents[train_size:train_size + val_size]\n",
328
+ "test_split_documents = training_documents[train_size + val_size:]\n",
329
+ "\n",
330
+ "# Print sizes to verify\n",
331
+ "print(f\"Training set: {len(training_split_documents)} docs\")\n",
332
+ "print(f\"Validation set: {len(val_split_documents)} docs\")\n",
333
+ "print(f\"Test set: {len(test_split_documents)} docs\")\n",
334
+ "\n",
335
+ "\n"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "execution_count": 19,
341
+ "metadata": {},
342
+ "outputs": [],
343
+ "source": [
344
+ "from langchain_openai import ChatOpenAI\n",
345
+ "\n",
346
+ "qa_chat_model = ChatOpenAI(\n",
347
+ " model=\"gpt-4o-mini\",\n",
348
+ " temperature=0\n",
349
+ ")"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": 22,
355
+ "metadata": {},
356
+ "outputs": [],
357
+ "source": [
358
+ "from langchain_core.prompts import ChatPromptTemplate\n",
359
+ "\n",
360
+ "qa_prompt = \"\"\"\\\n",
361
+ "Given the following context, you must generate questions based on only the provided context.\n",
362
+ "\n",
363
+ "You are to generate {n_questions} questions which should be provided in the following format:\n",
364
+ "\n",
365
+ "1. QUESTION #1\n",
366
+ "2. QUESTION #2\n",
367
+ "...\n",
368
+ "\n",
369
+ "Context:\n",
370
+ "{context}\n",
371
+ "\"\"\"\n",
372
+ "\n",
373
+ "qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": 23,
379
+ "metadata": {},
380
+ "outputs": [],
381
+ "source": [
382
+ "question_generation_chain = qa_prompt_template | qa_chat_model"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 24,
388
+ "metadata": {},
389
+ "outputs": [],
390
+ "source": [
391
+ "import asyncio\n",
392
+ "import uuid\n",
393
+ "from tqdm import tqdm\n",
394
+ "\n",
395
+ "async def process_document(document, n_questions):\n",
396
+ " questions_generated = await question_generation_chain.ainvoke({\"context\": document.page_content, \"n_questions\": n_questions})\n",
397
+ "\n",
398
+ " doc_questions = {}\n",
399
+ " doc_relevant_docs = {}\n",
400
+ "\n",
401
+ " for question in questions_generated.content.split(\"\\n\"):\n",
402
+ " question_id = str(uuid.uuid4())\n",
403
+ " doc_questions[question_id] = \"\".join(question.split(\".\")[1:]).strip()\n",
404
+ " doc_relevant_docs[question_id] = [document.metadata[\"id\"]]\n",
405
+ "\n",
406
+ " return doc_questions, doc_relevant_docs\n",
407
+ "\n",
408
+ "async def create_questions(documents, n_questions):\n",
409
+ " tasks = [process_document(doc, n_questions) for doc in documents]\n",
410
+ "\n",
411
+ " questions = {}\n",
412
+ " relevant_docs = {}\n",
413
+ "\n",
414
+ " for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc=\"Processing documents\"):\n",
415
+ " doc_questions, doc_relevant_docs = await task\n",
416
+ " questions.update(doc_questions)\n",
417
+ " relevant_docs.update(doc_relevant_docs)\n",
418
+ "\n",
419
+ " return questions, relevant_docs"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 25,
425
+ "metadata": {},
426
+ "outputs": [
427
+ {
428
+ "name": "stderr",
429
+ "output_type": "stream",
430
+ "text": [
431
+ "Processing documents: 100%|██████████| 4/4 [00:01<00:00, 3.75it/s]\n",
432
+ "Processing documents: 100%|██████████| 1/1 [00:00<00:00, 1.21it/s]\n",
433
+ "Processing documents: 100%|██████████| 2/2 [00:01<00:00, 1.98it/s]\n"
434
+ ]
435
+ }
436
+ ],
437
+ "source": [
438
+ "training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)\n",
439
+ "val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)\n",
440
+ "test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)"
441
+ ]
442
+ },
443
+ {
444
+ "cell_type": "code",
445
+ "execution_count": 26,
446
+ "metadata": {},
447
+ "outputs": [],
448
+ "source": [
449
+ "import json\n",
450
+ "\n",
451
+ "training_corpus = {train_item.metadata[\"id\"] : train_item.page_content for train_item in training_split_documents}\n",
452
+ "\n",
453
+ "train_dataset = {\n",
454
+ " \"questions\" : training_questions,\n",
455
+ " \"relevant_contexts\" : training_relevant_contexts,\n",
456
+ " \"corpus\" : training_corpus\n",
457
+ "}\n",
458
+ "\n",
459
+ "with open(\"training_dataset.jsonl\", \"w\") as f:\n",
460
+ " json.dump(train_dataset, f)\n",
461
+ "\n",
462
+ "\n",
463
+ "val_corpus = {val_item.metadata[\"id\"] : val_item.page_content for val_item in val_split_documents}\n",
464
+ "\n",
465
+ "val_dataset = {\n",
466
+ " \"questions\" : val_questions,\n",
467
+ " \"relevant_contexts\" : val_relevant_contexts,\n",
468
+ " \"corpus\" : val_corpus\n",
469
+ "}\n",
470
+ "\n",
471
+ "with open(\"val_dataset.jsonl\", \"w\") as f:\n",
472
+ " json.dump(val_dataset, f)\n",
473
+ "\n",
474
+ "\n",
475
+ "test_corpus = {test_item.metadata[\"id\"] : test_item.page_content for test_item in test_split_documents}\n",
476
+ "\n",
477
+ "test_dataset = {\n",
478
+ " \"questions\" : test_questions,\n",
479
+ " \"relevant_contexts\" : test_relevant_contexts,\n",
480
+ " \"corpus\" : test_corpus\n",
481
+ "}\n",
482
+ "\n",
483
+ "with open(\"test_dataset.jsonl\", \"w\") as f:\n",
484
+ " json.dump(test_dataset, f)"
485
+ ]
486
+ },
487
+ {
488
+ "cell_type": "code",
489
+ "execution_count": 27,
490
+ "metadata": {},
491
+ "outputs": [],
492
+ "source": [
493
+ "# !pip install -qU sentence_transformers datasets pyarrow"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 28,
499
+ "metadata": {},
500
+ "outputs": [],
501
+ "source": [
502
+ "from sentence_transformers import SentenceTransformer\n",
503
+ "\n",
504
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
505
+ "model = SentenceTransformer(model_id)"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 29,
511
+ "metadata": {},
512
+ "outputs": [],
513
+ "source": [
514
+ "from torch.utils.data import DataLoader\n",
515
+ "from torch.utils.data import Dataset\n",
516
+ "from sentence_transformers import InputExample"
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "code",
521
+ "execution_count": 30,
522
+ "metadata": {},
523
+ "outputs": [],
524
+ "source": [
525
+ "BATCH_SIZE = 10"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "code",
530
+ "execution_count": 31,
531
+ "metadata": {},
532
+ "outputs": [],
533
+ "source": [
534
+ "corpus = train_dataset['corpus']\n",
535
+ "queries = train_dataset['questions']\n",
536
+ "relevant_docs = train_dataset['relevant_contexts']\n",
537
+ "\n",
538
+ "examples = []\n",
539
+ "for query_id, query in queries.items():\n",
540
+ " doc_id = relevant_docs[query_id][0]\n",
541
+ " text = corpus[doc_id]\n",
542
+ " example = InputExample(texts=[query, text])\n",
543
+ " examples.append(example)"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "execution_count": 32,
549
+ "metadata": {},
550
+ "outputs": [],
551
+ "source": [
552
+ "loader = DataLoader(\n",
553
+ " examples, batch_size=BATCH_SIZE\n",
554
+ ")"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "code",
559
+ "execution_count": 33,
560
+ "metadata": {},
561
+ "outputs": [],
562
+ "source": [
563
+ "from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss\n",
564
+ "\n",
565
+ "matryoshka_dimensions = [768, 512, 256, 128, 64]\n",
566
+ "inner_train_loss = MultipleNegativesRankingLoss(model)\n",
567
+ "train_loss = MatryoshkaLoss(\n",
568
+ " model, inner_train_loss, matryoshka_dims=matryoshka_dimensions\n",
569
+ ")"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 34,
575
+ "metadata": {},
576
+ "outputs": [],
577
+ "source": [
578
+ "from sentence_transformers.evaluation import InformationRetrievalEvaluator\n",
579
+ "\n",
580
+ "corpus = val_dataset['corpus']\n",
581
+ "queries = val_dataset['questions']\n",
582
+ "relevant_docs = val_dataset['relevant_contexts']\n",
583
+ "\n",
584
+ "evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": 35,
590
+ "metadata": {},
591
+ "outputs": [],
592
+ "source": [
593
+ "EPOCHS = 5"
594
+ ]
595
+ },
596
+ {
597
+ "cell_type": "code",
598
+ "execution_count": 36,
599
+ "metadata": {},
600
+ "outputs": [
601
+ {
602
+ "data": {
603
+ "text/html": [
604
+ "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/dummy/dummy/runs/bel6hiln?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
605
+ ],
606
+ "text/plain": [
607
+ "<wandb.sdk.wandb_run.Run at 0x72704850af90>"
608
+ ]
609
+ },
610
+ "execution_count": 36,
611
+ "metadata": {},
612
+ "output_type": "execute_result"
613
+ }
614
+ ],
615
+ "source": [
616
+ "#!pip install wandb\n",
617
+ "\n",
618
+ "import wandb\n",
619
+ "wandb.init(mode=\"disabled\")"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": 37,
625
+ "metadata": {},
626
+ "outputs": [],
627
+ "source": [
628
+ "# !pip install torch\n",
629
+ "# !pip install accelerate>=0.26.0\n",
630
+ "# !pip install transformers\n",
631
+ "\n"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": 38,
637
+ "metadata": {},
638
+ "outputs": [],
639
+ "source": [
640
+ "#!pip install --upgrade --force-reinstall transformers accelerate torch\n",
641
+ "#!which python\n",
642
+ "\n"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": 46,
648
+ "metadata": {},
649
+ "outputs": [
650
+ {
651
+ "data": {
652
+ "text/html": [
653
+ "\n",
654
+ " <div>\n",
655
+ " \n",
656
+ " <progress value='5' max='5' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
657
+ " [5/5 00:01, Epoch 5/5]\n",
658
+ " </div>\n",
659
+ " <table border=\"1\" class=\"dataframe\">\n",
660
+ " <thead>\n",
661
+ " <tr style=\"text-align: left;\">\n",
662
+ " <th>Step</th>\n",
663
+ " <th>Training Loss</th>\n",
664
+ " <th>Validation Loss</th>\n",
665
+ " <th>Cosine Accuracy@1</th>\n",
666
+ " <th>Cosine Accuracy@3</th>\n",
667
+ " <th>Cosine Accuracy@5</th>\n",
668
+ " <th>Cosine Accuracy@10</th>\n",
669
+ " <th>Cosine Precision@1</th>\n",
670
+ " <th>Cosine Precision@3</th>\n",
671
+ " <th>Cosine Precision@5</th>\n",
672
+ " <th>Cosine Precision@10</th>\n",
673
+ " <th>Cosine Recall@1</th>\n",
674
+ " <th>Cosine Recall@3</th>\n",
675
+ " <th>Cosine Recall@5</th>\n",
676
+ " <th>Cosine Recall@10</th>\n",
677
+ " <th>Cosine Ndcg@10</th>\n",
678
+ " <th>Cosine Mrr@10</th>\n",
679
+ " <th>Cosine Map@100</th>\n",
680
+ " </tr>\n",
681
+ " </thead>\n",
682
+ " <tbody>\n",
683
+ " <tr>\n",
684
+ " <td>1</td>\n",
685
+ " <td>No log</td>\n",
686
+ " <td>No log</td>\n",
687
+ " <td>1.000000</td>\n",
688
+ " <td>1.000000</td>\n",
689
+ " <td>1.000000</td>\n",
690
+ " <td>1.000000</td>\n",
691
+ " <td>1.000000</td>\n",
692
+ " <td>0.333333</td>\n",
693
+ " <td>0.200000</td>\n",
694
+ " <td>0.100000</td>\n",
695
+ " <td>1.000000</td>\n",
696
+ " <td>1.000000</td>\n",
697
+ " <td>1.000000</td>\n",
698
+ " <td>1.000000</td>\n",
699
+ " <td>1.000000</td>\n",
700
+ " <td>1.000000</td>\n",
701
+ " <td>1.000000</td>\n",
702
+ " </tr>\n",
703
+ " <tr>\n",
704
+ " <td>2</td>\n",
705
+ " <td>No log</td>\n",
706
+ " <td>No log</td>\n",
707
+ " <td>1.000000</td>\n",
708
+ " <td>1.000000</td>\n",
709
+ " <td>1.000000</td>\n",
710
+ " <td>1.000000</td>\n",
711
+ " <td>1.000000</td>\n",
712
+ " <td>0.333333</td>\n",
713
+ " <td>0.200000</td>\n",
714
+ " <td>0.100000</td>\n",
715
+ " <td>1.000000</td>\n",
716
+ " <td>1.000000</td>\n",
717
+ " <td>1.000000</td>\n",
718
+ " <td>1.000000</td>\n",
719
+ " <td>1.000000</td>\n",
720
+ " <td>1.000000</td>\n",
721
+ " <td>1.000000</td>\n",
722
+ " </tr>\n",
723
+ " <tr>\n",
724
+ " <td>3</td>\n",
725
+ " <td>No log</td>\n",
726
+ " <td>No log</td>\n",
727
+ " <td>1.000000</td>\n",
728
+ " <td>1.000000</td>\n",
729
+ " <td>1.000000</td>\n",
730
+ " <td>1.000000</td>\n",
731
+ " <td>1.000000</td>\n",
732
+ " <td>0.333333</td>\n",
733
+ " <td>0.200000</td>\n",
734
+ " <td>0.100000</td>\n",
735
+ " <td>1.000000</td>\n",
736
+ " <td>1.000000</td>\n",
737
+ " <td>1.000000</td>\n",
738
+ " <td>1.000000</td>\n",
739
+ " <td>1.000000</td>\n",
740
+ " <td>1.000000</td>\n",
741
+ " <td>1.000000</td>\n",
742
+ " </tr>\n",
743
+ " <tr>\n",
744
+ " <td>4</td>\n",
745
+ " <td>No log</td>\n",
746
+ " <td>No log</td>\n",
747
+ " <td>1.000000</td>\n",
748
+ " <td>1.000000</td>\n",
749
+ " <td>1.000000</td>\n",
750
+ " <td>1.000000</td>\n",
751
+ " <td>1.000000</td>\n",
752
+ " <td>0.333333</td>\n",
753
+ " <td>0.200000</td>\n",
754
+ " <td>0.100000</td>\n",
755
+ " <td>1.000000</td>\n",
756
+ " <td>1.000000</td>\n",
757
+ " <td>1.000000</td>\n",
758
+ " <td>1.000000</td>\n",
759
+ " <td>1.000000</td>\n",
760
+ " <td>1.000000</td>\n",
761
+ " <td>1.000000</td>\n",
762
+ " </tr>\n",
763
+ " <tr>\n",
764
+ " <td>5</td>\n",
765
+ " <td>No log</td>\n",
766
+ " <td>No log</td>\n",
767
+ " <td>1.000000</td>\n",
768
+ " <td>1.000000</td>\n",
769
+ " <td>1.000000</td>\n",
770
+ " <td>1.000000</td>\n",
771
+ " <td>1.000000</td>\n",
772
+ " <td>0.333333</td>\n",
773
+ " <td>0.200000</td>\n",
774
+ " <td>0.100000</td>\n",
775
+ " <td>1.000000</td>\n",
776
+ " <td>1.000000</td>\n",
777
+ " <td>1.000000</td>\n",
778
+ " <td>1.000000</td>\n",
779
+ " <td>1.000000</td>\n",
780
+ " <td>1.000000</td>\n",
781
+ " <td>1.000000</td>\n",
782
+ " </tr>\n",
783
+ " </tbody>\n",
784
+ "</table><p>"
785
+ ],
786
+ "text/plain": [
787
+ "<IPython.core.display.HTML object>"
788
+ ]
789
+ },
790
+ "metadata": {},
791
+ "output_type": "display_data"
792
+ }
793
+ ],
794
+ "source": [
795
+ "warmup_steps = int(len(loader) * EPOCHS * 0.1)\n",
796
+ "\n",
797
+ "model.fit(\n",
798
+ " train_objectives=[(loader, train_loss)],\n",
799
+ " epochs=EPOCHS,\n",
800
+ " warmup_steps=warmup_steps,\n",
801
+ " output_path='models/midterm-compare-arctic-embed-m-ft',\n",
802
+ " show_progress_bar=True,\n",
803
+ " evaluator=evaluator,\n",
804
+ " evaluation_steps=50\n",
805
+ ")"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": 47,
811
+ "metadata": {},
812
+ "outputs": [
813
+ {
814
+ "data": {
815
+ "application/vnd.jupyter.widget-view+json": {
816
+ "model_id": "c3832f15349447c59ef0b7950d732a59",
817
+ "version_major": 2,
818
+ "version_minor": 0
819
+ },
820
+ "text/plain": [
821
+ "model.safetensors: 0%| | 0.00/436M [00:00<?, ?B/s]"
822
+ ]
823
+ },
824
+ "metadata": {},
825
+ "output_type": "display_data"
826
+ },
827
+ {
828
+ "data": {
829
+ "text/plain": [
830
+ "'https://huggingface.co/drewgenai/midterm-compare-arctic-embed-m-ft/commit/695a90e0d9d4a6ca560a5844c0e5a7cf4c4c74a9'"
831
+ ]
832
+ },
833
+ "execution_count": 47,
834
+ "metadata": {},
835
+ "output_type": "execute_result"
836
+ }
837
+ ],
838
+ "source": [
839
+ "model.push_to_hub(f\"{hf_username}/midterm-compare-arctic-embed-m-ft\")"
840
+ ]
841
+ },
842
+ {
843
+ "cell_type": "code",
844
+ "execution_count": 48,
845
+ "metadata": {},
846
+ "outputs": [
847
+ {
848
+ "data": {
849
+ "application/vnd.jupyter.widget-view+json": {
850
+ "model_id": "5a84694a9cff451581d43a244cbd6ce5",
851
+ "version_major": 2,
852
+ "version_minor": 0
853
+ },
854
+ "text/plain": [
855
+ "modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]"
856
+ ]
857
+ },
858
+ "metadata": {},
859
+ "output_type": "display_data"
860
+ },
861
+ {
862
+ "data": {
863
+ "application/vnd.jupyter.widget-view+json": {
864
+ "model_id": "d9635815ad784cc68833f2b4199c611b",
865
+ "version_major": 2,
866
+ "version_minor": 0
867
+ },
868
+ "text/plain": [
869
+ "config_sentence_transformers.json: 0%| | 0.00/281 [00:00<?, ?B/s]"
870
+ ]
871
+ },
872
+ "metadata": {},
873
+ "output_type": "display_data"
874
+ },
875
+ {
876
+ "data": {
877
+ "application/vnd.jupyter.widget-view+json": {
878
+ "model_id": "b425eef83f6c47cf90d9ad8df35bed07",
879
+ "version_major": 2,
880
+ "version_minor": 0
881
+ },
882
+ "text/plain": [
883
+ "README.md: 0%| | 0.00/26.3k [00:00<?, ?B/s]"
884
+ ]
885
+ },
886
+ "metadata": {},
887
+ "output_type": "display_data"
888
+ },
889
+ {
890
+ "data": {
891
+ "application/vnd.jupyter.widget-view+json": {
892
+ "model_id": "1c080b01bb4c43e3b0af3da190feff91",
893
+ "version_major": 2,
894
+ "version_minor": 0
895
+ },
896
+ "text/plain": [
897
+ "sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]"
898
+ ]
899
+ },
900
+ "metadata": {},
901
+ "output_type": "display_data"
902
+ },
903
+ {
904
+ "data": {
905
+ "application/vnd.jupyter.widget-view+json": {
906
+ "model_id": "8ebbd4faaa99434fbd6413f24fadc8b1",
907
+ "version_major": 2,
908
+ "version_minor": 0
909
+ },
910
+ "text/plain": [
911
+ "config.json: 0%| | 0.00/675 [00:00<?, ?B/s]"
912
+ ]
913
+ },
914
+ "metadata": {},
915
+ "output_type": "display_data"
916
+ },
917
+ {
918
+ "data": {
919
+ "application/vnd.jupyter.widget-view+json": {
920
+ "model_id": "5ef43ded862f4e5685af4b66e51922af",
921
+ "version_major": 2,
922
+ "version_minor": 0
923
+ },
924
+ "text/plain": [
925
+ "model.safetensors: 0%| | 0.00/436M [00:00<?, ?B/s]"
926
+ ]
927
+ },
928
+ "metadata": {},
929
+ "output_type": "display_data"
930
+ },
931
+ {
932
+ "name": "stderr",
933
+ "output_type": "stream",
934
+ "text": [
935
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
936
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
937
+ ]
938
+ },
939
+ {
940
+ "data": {
941
+ "application/vnd.jupyter.widget-view+json": {
942
+ "model_id": "f2704b3d8d214414acf54e23efb2de25",
943
+ "version_major": 2,
944
+ "version_minor": 0
945
+ },
946
+ "text/plain": [
947
+ "tokenizer_config.json: 0%| | 0.00/1.41k [00:00<?, ?B/s]"
948
+ ]
949
+ },
950
+ "metadata": {},
951
+ "output_type": "display_data"
952
+ },
953
+ {
954
+ "data": {
955
+ "application/vnd.jupyter.widget-view+json": {
956
+ "model_id": "70d0aca65df94b8c973d9e2aef700c6b",
957
+ "version_major": 2,
958
+ "version_minor": 0
959
+ },
960
+ "text/plain": [
961
+ "vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]"
962
+ ]
963
+ },
964
+ "metadata": {},
965
+ "output_type": "display_data"
966
+ },
967
+ {
968
+ "data": {
969
+ "application/vnd.jupyter.widget-view+json": {
970
+ "model_id": "b8a288bc2740416d8be044c1534138a0",
971
+ "version_major": 2,
972
+ "version_minor": 0
973
+ },
974
+ "text/plain": [
975
+ "tokenizer.json: 0%| | 0.00/712k [00:00<?, ?B/s]"
976
+ ]
977
+ },
978
+ "metadata": {},
979
+ "output_type": "display_data"
980
+ },
981
+ {
982
+ "data": {
983
+ "application/vnd.jupyter.widget-view+json": {
984
+ "model_id": "fd5494a1b2d2483884ccdfeaaf03e65c",
985
+ "version_major": 2,
986
+ "version_minor": 0
987
+ },
988
+ "text/plain": [
989
+ "special_tokens_map.json: 0%| | 0.00/695 [00:00<?, ?B/s]"
990
+ ]
991
+ },
992
+ "metadata": {},
993
+ "output_type": "display_data"
994
+ },
995
+ {
996
+ "data": {
997
+ "application/vnd.jupyter.widget-view+json": {
998
+ "model_id": "e6259269b65b45358940c42ac8e9d127",
999
+ "version_major": 2,
1000
+ "version_minor": 0
1001
+ },
1002
+ "text/plain": [
1003
+ "1_Pooling%2Fconfig.json: 0%| | 0.00/296 [00:00<?, ?B/s]"
1004
+ ]
1005
+ },
1006
+ "metadata": {},
1007
+ "output_type": "display_data"
1008
+ }
1009
+ ],
1010
+ "source": [
1011
+ "finetune_embeddings = HuggingFaceEmbeddings(model_name=f\"{hf_username}/midterm-compare-arctic-embed-m-ft\")"
1012
+ ]
1013
+ },
1014
+ {
1015
+ "cell_type": "markdown",
1016
+ "metadata": {},
1017
+ "source": [
1018
+ "### Testing above"
1019
+ ]
1020
+ },
1021
+ {
1022
+ "cell_type": "code",
1023
+ "execution_count": 33,
1024
+ "metadata": {},
1025
+ "outputs": [],
1026
+ "source": [
1027
+ "\n",
1028
+ "#!pip install -qU huggingface_hub\n",
1029
+ "#!pip install -qU ipywidgets\n"
1030
+ ]
1031
+ },
1032
+ {
1033
+ "cell_type": "code",
1034
+ "execution_count": 49,
1035
+ "metadata": {},
1036
+ "outputs": [
1037
+ {
1038
+ "name": "stderr",
1039
+ "output_type": "stream",
1040
+ "text": [
1041
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/demo-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
1042
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
1043
+ ]
1044
+ }
1045
+ ],
1046
+ "source": [
1047
+ "from sentence_transformers import SentenceTransformer\n",
1048
+ "from langchain.vectorstores import Qdrant\n",
1049
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
1050
+ "\n",
1051
+ "\n",
1052
+ "# Load the SentenceTransformer model\n",
1053
+ "#model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
1054
+ "model_id = f\"{hf_username}/demo-compare-arctic-embed-m-ft\" \n",
1055
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
1056
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
1057
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
1058
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\n",
1059
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id, model_kwargs={\"trust_remote_code\": True})\n",
1060
+ "\n",
1061
+ "\n",
1062
+ "# Load documents into Qdrant\n",
1063
+ "qdrant_vectorstore = Qdrant.from_documents(\n",
1064
+ " documents_with_metadata,\n",
1065
+ " embedding_model,\n",
1066
+ " location=\":memory:\", # In-memory for testing\n",
1067
+ " collection_name=\"document_comparison\",\n",
1068
+ ")\n",
1069
+ "\n",
1070
+ "# Create a retriever\n",
1071
+ "qdrant_retriever = qdrant_vectorstore.as_retriever()"
1072
+ ]
1073
+ },
1074
+ {
1075
+ "cell_type": "code",
1076
+ "execution_count": 35,
1077
+ "metadata": {},
1078
+ "outputs": [],
1079
+ "source": [
1080
+ "# from langchain_core.prompts import ChatPromptTemplate\n",
1081
+ "\n",
1082
+ "# RAG_PROMPT = \"\"\"\n",
1083
+ "# CONTEXT:\n",
1084
+ "# {context}\n",
1085
+ "\n",
1086
+ "# QUERY:\n",
1087
+ "# {question}\n",
1088
+ "\n",
1089
+ "# You are a helpful assistant. Use the available context to answer the question. If you can't answer the question, say you don't know.\n",
1090
+ "# \"\"\"\n",
1091
+ "\n",
1092
+ "# rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)\n",
1093
+ "\n",
1094
+ "# from langchain_openai import ChatOpenAI\n",
1095
+ "\n",
1096
+ "# #openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
1097
+ "# openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
1098
+ "\n",
1099
+ "# from operator import itemgetter\n",
1100
+ "# from langchain.schema.output_parser import StrOutputParser\n",
1101
+ "\n",
1102
+ "# rag_chain = (\n",
1103
+ "# {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
1104
+ "# | rag_prompt | openai_chat_model | StrOutputParser()\n",
1105
+ "# )"
1106
+ ]
1107
+ },
1108
+ {
1109
+ "cell_type": "code",
1110
+ "execution_count": 50,
1111
+ "metadata": {},
1112
+ "outputs": [],
1113
+ "source": [
1114
+ "from langchain_core.prompts import ChatPromptTemplate\n",
1115
+ "RAG_PROMPT = \"\"\"\n",
1116
+ "CONTEXT:\n",
1117
+ "{context}\n",
1118
+ "\n",
1119
+ "QUERY:\n",
1120
+ "{question}\n",
1121
+ "\n",
1122
+ "You are a helpful assistant. Use the available context to answer the question.\n",
1123
+ "\n",
1124
+ "Return the response in **valid JSON format** with the following structure:\n",
1125
+ "\n",
1126
+ "[\n",
1127
+ " {{\n",
1128
+ " \"Derived Description\": \"A short name for the matched concept\",\n",
1129
+ " \"Protocol_1_Name\": \"Protocol 1 - Matching Element\",\n",
1130
+ " \"Protocol_2_Name\": \"Protocol 2 - Matching Element\"\n",
1131
+ " }},\n",
1132
+ " ...\n",
1133
+ "]\n",
1134
+ "\n",
1135
+ "### Rules:\n",
1136
+ "1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.\n",
1137
+ "2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.\n",
1138
+ "3. If no matching element is found in a protocol, leave it empty (\"\").\n",
1139
+ "4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.\n",
1140
+ "5. It should include all the elements in the two protocols.\n",
1141
+ "6. If it cannot match the element, create the row and include the protocol it did find and put \"could not match\" in the other protocol column.\n",
1142
+ "\"\"\"\n",
1143
+ "\n",
1144
+ "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)\n",
1145
+ "\n",
1146
+ "from langchain_openai import ChatOpenAI\n",
1147
+ "\n",
1148
+ "#openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
1149
+ "openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
1150
+ "\n",
1151
+ "from operator import itemgetter\n",
1152
+ "from langchain.schema.output_parser import StrOutputParser\n",
1153
+ "\n",
1154
+ "rag_chain = (\n",
1155
+ " {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
1156
+ " | rag_prompt | openai_chat_model | StrOutputParser()\n",
1157
+ ")"
1158
+ ]
1159
+ },
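+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustration only, with hypothetical element names: the shape of response the RAG prompt\n",
+ "# above asks for, and which parse_rag_output() further below expects to be valid JSON.\n",
+ "import json\n",
+ "\n",
+ "example_response = '[{\"Derived Description\": \"Pain coping\", \"Protocol_1_Name\": \"CPAI-10 - Item 3\", \"Protocol_2_Name\": \"could not match\"}]'\n",
+ "parsed_example = json.loads(example_response) # -> list of dicts, one per matched element"
+ ]
+ },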
1160
+ {
1161
+ "cell_type": "code",
1162
+ "execution_count": 51,
1163
+ "metadata": {},
1164
+ "outputs": [],
1165
+ "source": [
1166
+ "question_text = \"\"\"Between these two files containing protocols, can you find the data elements in each that most likely match the element in the other and output a CSV that lists three columns:\n",
1167
+ "\n",
1168
+ "The questions within elements will be similar between the two documents and can be used to match the elements.\n",
1169
+ "\n",
1170
+ "1. Derived description from the two documents describing the index/measure/scale.\n",
1171
+ "2. A column for each standard.\n",
1172
+ "3. In the column for each name/version, the data element used to capture that description.\n",
1173
+ "\n",
1174
+ "There should only be one row for each scale/index/etc.\n",
1175
+ "The description should not be one of the questions but a name that best describes the similar data elements.\"\"\"\n",
1176
+ "\n",
1177
+ "response_text = rag_chain.invoke({\"question\": question_text})\n",
1178
+ "# response = rag_chain.invoke({\"question\": question_text})"
1179
+ ]
1180
+ },
1181
+ {
1182
+ "cell_type": "code",
1183
+ "execution_count": 52,
1184
+ "metadata": {},
1185
+ "outputs": [
1186
+ {
1187
+ "name": "stdout",
1188
+ "output_type": "stream",
1189
+ "text": [
1190
+ "✅ CSV file saved: matching_data_elements.csv\n"
1191
+ ]
1192
+ }
1193
+ ],
1194
+ "source": [
1195
+ "import os\n",
+ "import json\n",
1196
+ "import pandas as pd\n",
1197
+ "\n",
1198
+ "def parse_rag_output(response_text):\n",
1199
+ " \"\"\"Extract structured JSON data from the RAG response.\"\"\"\n",
1200
+ " try:\n",
1201
+ " structured_data = json.loads(response_text)\n",
1202
+ "\n",
1203
+ " # Ensure similarity score is always included\n",
1204
+ " for item in structured_data:\n",
1205
+ " item.setdefault(\"Similarity Score\", \"N/A\") # Default if missing\n",
1206
+ "\n",
1207
+ " return structured_data\n",
1208
+ " except json.JSONDecodeError:\n",
1209
+ " print(\"Error: Response is not valid JSON.\")\n",
1210
+ " return None\n",
1211
+ "\n",
1212
+ "def save_to_csv(data, directory=\"./output\", filename=\"matching_data_elements.csv\"):\n",
1213
+ " \"\"\"Save structured data to CSV.\"\"\"\n",
1214
+ " if not data:\n",
1215
+ " print(\"No data to save.\")\n",
1216
+ " return\n",
1217
+ "\n",
1218
+ " file_path = os.path.join(directory, filename)\n",
1219
+ " df = pd.DataFrame(data, columns=[\"Derived Description\", \"Protocol_1_Name\", \"Protocol_2_Name\"]) # Ensure correct columns\n",
1220
+ " df.to_csv(file_path, index=False)\n",
1221
+ " print(f\"✅ CSV file saved: {filename}\")\n",
1222
+ "\n",
1223
+ "# Run the pipeline\n",
1224
+ "structured_output = parse_rag_output(response_text)\n",
1225
+ "save_to_csv(structured_output)\n"
1226
+ ]
1227
+ },
1228
+ {
1229
+ "cell_type": "code",
1230
+ "execution_count": null,
1231
+ "metadata": {},
1232
+ "outputs": [],
1233
+ "source": []
1234
+ },
1235
+ {
1236
+ "cell_type": "code",
1237
+ "execution_count": 40,
1238
+ "metadata": {},
1239
+ "outputs": [],
1240
+ "source": [
1241
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2\"})"
1242
+ ]
1243
+ },
1244
+ {
1245
+ "cell_type": "code",
1246
+ "execution_count": 41,
1247
+ "metadata": {},
1248
+ "outputs": [],
1249
+ "source": [
1250
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2. these are example headings not the ones in the actual documents. just list the matches not the rational. Can you list multiple matches?\"})"
1251
+ ]
1252
+ },
1253
+ {
1254
+ "cell_type": "code",
1255
+ "execution_count": null,
1256
+ "metadata": {},
1257
+ "outputs": [],
1258
+ "source": []
1259
+ }
1260
+ ],
1261
+ "metadata": {
1262
+ "kernelspec": {
1263
+ "display_name": ".venv",
1264
+ "language": "python",
1265
+ "name": "python3"
1266
+ },
1267
+ "language_info": {
1268
+ "codemirror_mode": {
1269
+ "name": "ipython",
1270
+ "version": 3
1271
+ },
1272
+ "file_extension": ".py",
1273
+ "mimetype": "text/x-python",
1274
+ "name": "python",
1275
+ "nbconvert_exporter": "python",
1276
+ "pygments_lexer": "ipython3",
1277
+ "version": "3.13.1"
1278
+ }
1279
+ },
1280
+ "nbformat": 4,
1281
+ "nbformat_minor": 2
1282
+ }
03-testembedtune.ipynb ADDED
@@ -0,0 +1,1861 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 19,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# !pip install nest_asyncio \\\n",
10
+ "# langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters \\\n",
11
+ "# python-pptx==1.0.2 nltk==3.9.1 pymupdf lxml \\\n",
12
+ "# sentence-transformers IProgress \\\n",
13
+ "# huggingface_hub ipywidgets \\\n",
14
+ "# qdrant-client langchain_experimental\n",
15
+ "\n",
16
+ "# !pip install sentence_transformers datasets pyarrow\n",
17
+ "# !pip install torch\n",
18
+ "# !pip install accelerate>=0.26.0\n",
19
+ "# !pip install transformers\n",
20
+ "# !pip install wandb\n",
21
+ "# !pip install ragas\n",
22
+ "\n"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 1,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "\n",
32
+ "import nest_asyncio\n",
33
+ "\n",
34
+ "nest_asyncio.apply()"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 3,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "import os\n",
44
+ "import getpass\n",
45
+ "\n",
46
+ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key: \")\n",
47
+ "os.environ[\"RAGAS_APP_TOKEN\"] = getpass.getpass(\"Please enter your Ragas API key!\")"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 4,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "hf_username = getpass.getpass(\"Enter Your Hugging Face Username: \")\n"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 5,
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "data": {
66
+ "application/vnd.jupyter.widget-view+json": {
67
+ "model_id": "2098545c1f924b7c85f8b7ca809f6f1a",
68
+ "version_major": 2,
69
+ "version_minor": 0
70
+ },
71
+ "text/plain": [
72
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
73
+ ]
74
+ },
75
+ "metadata": {},
76
+ "output_type": "display_data"
77
+ },
78
+ {
79
+ "name": "stderr",
80
+ "output_type": "stream",
81
+ "text": [
82
+ "Token has not been saved to git credential helper.\n"
83
+ ]
84
+ }
85
+ ],
86
+ "source": [
87
+ "from huggingface_hub import notebook_login\n",
88
+ "notebook_login()"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 6,
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "name": "stdout",
98
+ "output_type": "stream",
99
+ "text": [
100
+ "{'type': 'user', 'id': '67624d1b57e77fe6e0c87ae5', 'name': 'drewgenai', 'fullname': 'Drew DeMarco', 'email': '[email protected]', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/L6eLaZmCK4jqW3ZTLYIAR.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'newotken', 'role': 'write', 'createdAt': '2025-02-12T04:11:04.130Z'}}}\n"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "from huggingface_hub import whoami\n",
106
+ "print(whoami())\n"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 7,
112
+ "metadata": {},
113
+ "outputs": [
114
+ {
115
+ "name": "stdout",
116
+ "output_type": "stream",
117
+ "text": [
118
+ "mkdir: cannot create directory ‘example_files’: File exists\n",
119
+ "mkdir: cannot create directory ‘output’: File exists\n"
120
+ ]
121
+ }
122
+ ],
123
+ "source": [
124
+ "!mkdir example_files\n",
125
+ "!mkdir output"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 8,
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "from langchain_community.document_loaders import DirectoryLoader\n",
135
+ "from langchain_community.document_loaders import PyMuPDFLoader\n",
136
+ "\n",
137
+ "path = \"example_files/\"\n",
138
+ "text_loader = DirectoryLoader(path, glob=\"*.pdf\", loader_cls=PyMuPDFLoader)"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "markdown",
143
+ "metadata": {},
144
+ "source": [
145
+ "1️⃣ Header-Based Chunking (Title-Based Splitter)\n",
146
+ "Uses document structure to split on headings, section titles, or patterns.\n",
147
+ "Works well for structured documents with named assessments, numbered lists, or headers.\n",
148
+ "Example: If it detects Chronic Pain Adjustment Index (CPAI-10), it groups everything under that title (a minimal header-based sketch follows after this cell).\n",
149
+ "2️⃣ Semantic Chunking (Text-Meaning Splitter)\n",
150
+ "Uses embeddings or sentence similarity to decide where to break chunks.\n",
151
+ "Prevents splitting mid-context if sentences are closely related.\n",
152
+ "Example: Groups all related pain-assessment questions into one chunk."
153
+ ]
154
+ },
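+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal sketch of the header-based chunking idea described above (illustration only,\n",
+ "# not used by the rest of this notebook). It assumes the text carries markdown-style\n",
+ "# headings, which these PDF extracts may not; the header levels and key names are assumptions.\n",
+ "from langchain_text_splitters import MarkdownHeaderTextSplitter\n",
+ "\n",
+ "headers_to_split_on = [(\"#\", \"title\"), (\"##\", \"section\")]\n",
+ "header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
+ "\n",
+ "# Example usage once all_documents has been loaded further below:\n",
+ "# header_chunks = header_splitter.split_text(all_documents[0].page_content)"
+ ]
+ },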
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": null,
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": []
161
+ },
162
+ {
163
+ "cell_type": "markdown",
164
+ "metadata": {},
165
+ "source": [
166
+ "### Testing below\n"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 78,
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "\n",
176
+ "\n",
177
+ "from langchain_experimental.text_splitter import SemanticChunker\n",
178
+ "\n",
179
+ "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
180
+ "\n",
181
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
182
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
183
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
184
+ "\n",
185
+ "semantic_splitter = SemanticChunker(embedding_model)\n",
186
+ "\n",
187
+ "all_documents = text_loader.load()\n",
188
+ "documents_with_metadata = []\n",
189
+ "\n"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 10,
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "from langchain.schema import Document\n",
199
+ "\n",
200
+ "for doc in all_documents:\n",
201
+ " source_name = doc.metadata.get(\"source\", \"unknown\") # Get document source\n",
202
+ "\n",
203
+ " # Use SemanticChunker to intelligently split text\n",
204
+ " chunks = semantic_splitter.split_text(doc.page_content)\n",
205
+ "\n",
206
+ " # Convert chunks into LangChain Document format with metadata\n",
207
+ " for chunk in chunks:\n",
208
+ " doc_chunk = Document(page_content=chunk, metadata={\"source\": source_name})\n",
209
+ " documents_with_metadata.append(doc_chunk)"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": null,
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": []
218
+ },
219
+ {
220
+ "cell_type": "markdown",
221
+ "metadata": {},
222
+ "source": [
223
+ "### New testing below"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": 75,
229
+ "metadata": {},
230
+ "outputs": [],
231
+ "source": [
232
+ "#training_documents = text_loader.load()\n",
233
+ "### keeping documents_with_metadata and training_documents separate for now\n",
234
+ "\n",
235
+ "\n",
236
+ "from langchain.schema import Document\n",
237
+ "\n",
238
+ "training_documents = []\n",
239
+ "\n",
240
+ "\n",
241
+ "for doc in all_documents:\n",
242
+ " source_name = doc.metadata.get(\"source\", \"unknown\") # Get document source\n",
243
+ "\n",
244
+ " # Use SemanticChunker to intelligently split text\n",
245
+ " chunks = semantic_splitter.split_text(doc.page_content)\n",
246
+ "\n",
247
+ " # Convert chunks into LangChain Document format with metadata\n",
248
+ " for chunk in chunks:\n",
249
+ " doc_chunk = Document(page_content=chunk, metadata={\"source\": source_name})\n",
250
+ " training_documents.append(doc_chunk)\n",
251
+ "\n",
252
+ "\n",
253
+ "\n"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": 76,
259
+ "metadata": {},
260
+ "outputs": [],
261
+ "source": [
262
+ "import uuid\n",
263
+ "\n",
264
+ "id_set = set()\n",
265
+ "\n",
266
+ "for document in training_documents:\n",
267
+ " id = str(uuid.uuid4())\n",
268
+ " while id in id_set:\n",
269
+ " id = str(uuid.uuid4())\n",
270
+ " id_set.add(id)\n",
271
+ " document.metadata[\"id\"] = id"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 77,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "name": "stdout",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "Training set: 9 docs\n",
284
+ "Validation set: 2 docs\n",
285
+ "Test set: 3 docs\n"
286
+ ]
287
+ }
288
+ ],
289
+ "source": [
290
+ "# Define split percentages\n",
291
+ "train_ratio = 0.7 # 70% training\n",
292
+ "val_ratio = 0.2 # 20% validation\n",
293
+ "test_ratio = 0.1 # 10% test\n",
294
+ "\n",
295
+ "# Calculate index breakpoints\n",
296
+ "total_docs = len(training_documents)\n",
297
+ "train_size = int(total_docs * train_ratio)\n",
298
+ "val_size = int(total_docs * val_ratio)\n",
299
+ "\n",
300
+ "# Perform the splits\n",
301
+ "training_split_documents = training_documents[:train_size]\n",
302
+ "val_split_documents = training_documents[train_size:train_size + val_size]\n",
303
+ "test_split_documents = training_documents[train_size + val_size:]\n",
304
+ "\n",
305
+ "# Print sizes to verify\n",
306
+ "print(f\"Training set: {len(training_split_documents)} docs\")\n",
307
+ "print(f\"Validation set: {len(val_split_documents)} docs\")\n",
308
+ "print(f\"Test set: {len(test_split_documents)} docs\")\n",
309
+ "\n",
310
+ "\n"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": 44,
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "from langchain_openai import ChatOpenAI\n",
320
+ "\n",
321
+ "qa_chat_model = ChatOpenAI(\n",
322
+ " model=\"gpt-4o-mini\",\n",
323
+ " temperature=0\n",
324
+ ")"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 45,
330
+ "metadata": {},
331
+ "outputs": [],
332
+ "source": [
333
+ "from langchain_core.prompts import ChatPromptTemplate\n",
334
+ "\n",
335
+ "qa_prompt = \"\"\"\\\n",
336
+ "Given the following context, you must generate questions based on only the provided context.\n",
337
+ "\n",
338
+ "You are to generate {n_questions} questions which should be provided in the following format:\n",
339
+ "\n",
340
+ "1. QUESTION #1\n",
341
+ "2. QUESTION #2\n",
342
+ "...\n",
343
+ "\n",
344
+ "Context:\n",
345
+ "{context}\n",
346
+ "\"\"\"\n",
347
+ "\n",
348
+ "qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": 46,
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": [
357
+ "question_generation_chain = qa_prompt_template | qa_chat_model"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": 47,
363
+ "metadata": {},
364
+ "outputs": [],
365
+ "source": [
366
+ "import asyncio\n",
367
+ "import uuid\n",
368
+ "from tqdm import tqdm\n",
369
+ "\n",
370
+ "async def process_document(document, n_questions):\n",
371
+ " questions_generated = await question_generation_chain.ainvoke({\"context\": document.page_content, \"n_questions\": n_questions})\n",
372
+ "\n",
373
+ " doc_questions = {}\n",
374
+ " doc_relevant_docs = {}\n",
375
+ "\n",
376
+ " for question in questions_generated.content.split(\"\\n\"):\n",
377
+ " question_id = str(uuid.uuid4())\n",
378
+ " doc_questions[question_id] = \"\".join(question.split(\".\")[1:]).strip()\n",
379
+ " doc_relevant_docs[question_id] = [document.metadata[\"id\"]]\n",
380
+ "\n",
381
+ " return doc_questions, doc_relevant_docs\n",
382
+ "\n",
383
+ "async def create_questions(documents, n_questions):\n",
384
+ " tasks = [process_document(doc, n_questions) for doc in documents]\n",
385
+ "\n",
386
+ " questions = {}\n",
387
+ " relevant_docs = {}\n",
388
+ "\n",
389
+ " for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc=\"Processing documents\"):\n",
390
+ " doc_questions, doc_relevant_docs = await task\n",
391
+ " questions.update(doc_questions)\n",
392
+ " relevant_docs.update(doc_relevant_docs)\n",
393
+ "\n",
394
+ " return questions, relevant_docs"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": 48,
400
+ "metadata": {},
401
+ "outputs": [
402
+ {
403
+ "name": "stderr",
404
+ "output_type": "stream",
405
+ "text": [
406
+ "Processing documents: 100%|██████████| 9/9 [00:02<00:00, 4.44it/s]\n",
407
+ "Processing documents: 100%|██████████| 2/2 [00:01<00:00, 1.74it/s]\n",
408
+ "Processing documents: 100%|██████████| 3/3 [00:02<00:00, 1.50it/s]\n"
409
+ ]
410
+ }
411
+ ],
412
+ "source": [
413
+ "training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)\n",
414
+ "val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)\n",
415
+ "test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "execution_count": 49,
421
+ "metadata": {},
422
+ "outputs": [],
423
+ "source": [
424
+ "import json\n",
425
+ "\n",
426
+ "training_corpus = {train_item.metadata[\"id\"] : train_item.page_content for train_item in training_split_documents}\n",
427
+ "\n",
428
+ "train_dataset = {\n",
429
+ " \"questions\" : training_questions,\n",
430
+ " \"relevant_contexts\" : training_relevant_contexts,\n",
431
+ " \"corpus\" : training_corpus\n",
432
+ "}\n",
433
+ "\n",
434
+ "with open(\"training_dataset.jsonl\", \"w\") as f:\n",
435
+ " json.dump(train_dataset, f)\n",
436
+ "\n",
437
+ "\n",
438
+ "val_corpus = {val_item.metadata[\"id\"] : val_item.page_content for val_item in val_split_documents}\n",
439
+ "\n",
440
+ "val_dataset = {\n",
441
+ " \"questions\" : val_questions,\n",
442
+ " \"relevant_contexts\" : val_relevant_contexts,\n",
443
+ " \"corpus\" : val_corpus\n",
444
+ "}\n",
445
+ "\n",
446
+ "with open(\"val_dataset.jsonl\", \"w\") as f:\n",
447
+ " json.dump(val_dataset, f)\n",
448
+ "\n",
449
+ "\n",
450
+ "test_corpus = {test_item.metadata[\"id\"] : test_item.page_content for test_item in test_split_documents}\n",
451
+ "\n",
452
+ "test_dataset = {\n",
453
+ " \"questions\" : test_questions,\n",
454
+ " \"relevant_contexts\" : test_relevant_contexts,\n",
455
+ " \"corpus\" : test_corpus\n",
456
+ "}\n",
457
+ "\n",
458
+ "with open(\"test_dataset.jsonl\", \"w\") as f:\n",
459
+ " json.dump(test_dataset, f)"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 50,
465
+ "metadata": {},
466
+ "outputs": [],
467
+ "source": [
468
+ "# !pip install -qU sentence_transformers datasets pyarrow"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": 51,
474
+ "metadata": {},
475
+ "outputs": [],
476
+ "source": [
477
+ "from sentence_transformers import SentenceTransformer\n",
478
+ "\n",
479
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
480
+ "model = SentenceTransformer(model_id)"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": 52,
486
+ "metadata": {},
487
+ "outputs": [],
488
+ "source": [
489
+ "from torch.utils.data import DataLoader\n",
490
+ "from torch.utils.data import Dataset\n",
491
+ "from sentence_transformers import InputExample"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "code",
496
+ "execution_count": 53,
497
+ "metadata": {},
498
+ "outputs": [],
499
+ "source": [
500
+ "BATCH_SIZE = 10"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "code",
505
+ "execution_count": 54,
506
+ "metadata": {},
507
+ "outputs": [],
508
+ "source": [
509
+ "corpus = train_dataset['corpus']\n",
510
+ "queries = train_dataset['questions']\n",
511
+ "relevant_docs = train_dataset['relevant_contexts']\n",
512
+ "\n",
513
+ "examples = []\n",
514
+ "for query_id, query in queries.items():\n",
515
+ " doc_id = relevant_docs[query_id][0]\n",
516
+ " text = corpus[doc_id]\n",
517
+ " example = InputExample(texts=[query, text])\n",
518
+ " examples.append(example)"
519
+ ]
520
+ },
521
+ {
522
+ "cell_type": "code",
523
+ "execution_count": 55,
524
+ "metadata": {},
525
+ "outputs": [],
526
+ "source": [
527
+ "loader = DataLoader(\n",
528
+ " examples, batch_size=BATCH_SIZE\n",
529
+ ")"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 56,
535
+ "metadata": {},
536
+ "outputs": [],
537
+ "source": [
538
+ "from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss\n",
539
+ "\n",
540
+ "matryoshka_dimensions = [768, 512, 256, 128, 64]\n",
541
+ "inner_train_loss = MultipleNegativesRankingLoss(model)\n",
542
+ "train_loss = MatryoshkaLoss(\n",
543
+ " model, inner_train_loss, matryoshka_dims=matryoshka_dimensions\n",
544
+ ")"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": 57,
550
+ "metadata": {},
551
+ "outputs": [],
552
+ "source": [
553
+ "from sentence_transformers.evaluation import InformationRetrievalEvaluator\n",
554
+ "\n",
555
+ "corpus = val_dataset['corpus']\n",
556
+ "queries = val_dataset['questions']\n",
557
+ "relevant_docs = val_dataset['relevant_contexts']\n",
558
+ "\n",
559
+ "evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)"
560
+ ]
561
+ },
562
+ {
563
+ "cell_type": "code",
564
+ "execution_count": 58,
565
+ "metadata": {},
566
+ "outputs": [],
567
+ "source": [
568
+ "EPOCHS = 5"
569
+ ]
570
+ },
571
+ {
572
+ "cell_type": "code",
573
+ "execution_count": 59,
574
+ "metadata": {},
575
+ "outputs": [
576
+ {
577
+ "data": {
578
+ "text/html": [
579
+ "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/dummy/dummy/runs/3hjt799n?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
580
+ ],
581
+ "text/plain": [
582
+ "<wandb.sdk.wandb_run.Run at 0x749b55325d10>"
583
+ ]
584
+ },
585
+ "execution_count": 59,
586
+ "metadata": {},
587
+ "output_type": "execute_result"
588
+ }
589
+ ],
590
+ "source": [
591
+ "#!pip install wandb\n",
592
+ "\n",
593
+ "import wandb\n",
594
+ "wandb.init(mode=\"disabled\")"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": 69,
600
+ "metadata": {},
601
+ "outputs": [
602
+ {
603
+ "data": {
604
+ "application/vnd.jupyter.widget-view+json": {
605
+ "model_id": "400bc1e49a854008a875534a9d3a50d4",
606
+ "version_major": 2,
607
+ "version_minor": 0
608
+ },
609
+ "text/plain": [
610
+ "Computing widget examples: 0%| | 0/1 [00:00<?, ?example/s]"
611
+ ]
612
+ },
613
+ "metadata": {},
614
+ "output_type": "display_data"
615
+ },
616
+ {
617
+ "name": "stderr",
618
+ "output_type": "stream",
619
+ "text": [
620
+ "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.\n"
621
+ ]
622
+ },
623
+ {
624
+ "data": {
625
+ "text/html": [
626
+ "\n",
627
+ " <div>\n",
628
+ " \n",
629
+ " <progress value='10' max='10' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
630
+ " [10/10 00:02, Epoch 5/5]\n",
631
+ " </div>\n",
632
+ " <table border=\"1\" class=\"dataframe\">\n",
633
+ " <thead>\n",
634
+ " <tr style=\"text-align: left;\">\n",
635
+ " <th>Step</th>\n",
636
+ " <th>Training Loss</th>\n",
637
+ " <th>Validation Loss</th>\n",
638
+ " <th>Cosine Accuracy@1</th>\n",
639
+ " <th>Cosine Accuracy@3</th>\n",
640
+ " <th>Cosine Accuracy@5</th>\n",
641
+ " <th>Cosine Accuracy@10</th>\n",
642
+ " <th>Cosine Precision@1</th>\n",
643
+ " <th>Cosine Precision@3</th>\n",
644
+ " <th>Cosine Precision@5</th>\n",
645
+ " <th>Cosine Precision@10</th>\n",
646
+ " <th>Cosine Recall@1</th>\n",
647
+ " <th>Cosine Recall@3</th>\n",
648
+ " <th>Cosine Recall@5</th>\n",
649
+ " <th>Cosine Recall@10</th>\n",
650
+ " <th>Cosine Ndcg@10</th>\n",
651
+ " <th>Cosine Mrr@10</th>\n",
652
+ " <th>Cosine Map@100</th>\n",
653
+ " </tr>\n",
654
+ " </thead>\n",
655
+ " <tbody>\n",
656
+ " <tr>\n",
657
+ " <td>2</td>\n",
658
+ " <td>No log</td>\n",
659
+ " <td>No log</td>\n",
660
+ " <td>0.750000</td>\n",
661
+ " <td>1.000000</td>\n",
662
+ " <td>1.000000</td>\n",
663
+ " <td>1.000000</td>\n",
664
+ " <td>0.750000</td>\n",
665
+ " <td>0.333333</td>\n",
666
+ " <td>0.200000</td>\n",
667
+ " <td>0.100000</td>\n",
668
+ " <td>0.750000</td>\n",
669
+ " <td>1.000000</td>\n",
670
+ " <td>1.000000</td>\n",
671
+ " <td>1.000000</td>\n",
672
+ " <td>0.907732</td>\n",
673
+ " <td>0.875000</td>\n",
674
+ " <td>0.875000</td>\n",
675
+ " </tr>\n",
676
+ " <tr>\n",
677
+ " <td>4</td>\n",
678
+ " <td>No log</td>\n",
679
+ " <td>No log</td>\n",
680
+ " <td>0.750000</td>\n",
681
+ " <td>1.000000</td>\n",
682
+ " <td>1.000000</td>\n",
683
+ " <td>1.000000</td>\n",
684
+ " <td>0.750000</td>\n",
685
+ " <td>0.333333</td>\n",
686
+ " <td>0.200000</td>\n",
687
+ " <td>0.100000</td>\n",
688
+ " <td>0.750000</td>\n",
689
+ " <td>1.000000</td>\n",
690
+ " <td>1.000000</td>\n",
691
+ " <td>1.000000</td>\n",
692
+ " <td>0.907732</td>\n",
693
+ " <td>0.875000</td>\n",
694
+ " <td>0.875000</td>\n",
695
+ " </tr>\n",
696
+ " <tr>\n",
697
+ " <td>6</td>\n",
698
+ " <td>No log</td>\n",
699
+ " <td>No log</td>\n",
700
+ " <td>0.750000</td>\n",
701
+ " <td>1.000000</td>\n",
702
+ " <td>1.000000</td>\n",
703
+ " <td>1.000000</td>\n",
704
+ " <td>0.750000</td>\n",
705
+ " <td>0.333333</td>\n",
706
+ " <td>0.200000</td>\n",
707
+ " <td>0.100000</td>\n",
708
+ " <td>0.750000</td>\n",
709
+ " <td>1.000000</td>\n",
710
+ " <td>1.000000</td>\n",
711
+ " <td>1.000000</td>\n",
712
+ " <td>0.907732</td>\n",
713
+ " <td>0.875000</td>\n",
714
+ " <td>0.875000</td>\n",
715
+ " </tr>\n",
716
+ " <tr>\n",
717
+ " <td>8</td>\n",
718
+ " <td>No log</td>\n",
719
+ " <td>No log</td>\n",
720
+ " <td>0.750000</td>\n",
721
+ " <td>1.000000</td>\n",
722
+ " <td>1.000000</td>\n",
723
+ " <td>1.000000</td>\n",
724
+ " <td>0.750000</td>\n",
725
+ " <td>0.333333</td>\n",
726
+ " <td>0.200000</td>\n",
727
+ " <td>0.100000</td>\n",
728
+ " <td>0.750000</td>\n",
729
+ " <td>1.000000</td>\n",
730
+ " <td>1.000000</td>\n",
731
+ " <td>1.000000</td>\n",
732
+ " <td>0.907732</td>\n",
733
+ " <td>0.875000</td>\n",
734
+ " <td>0.875000</td>\n",
735
+ " </tr>\n",
736
+ " <tr>\n",
737
+ " <td>10</td>\n",
738
+ " <td>No log</td>\n",
739
+ " <td>No log</td>\n",
740
+ " <td>0.750000</td>\n",
741
+ " <td>1.000000</td>\n",
742
+ " <td>1.000000</td>\n",
743
+ " <td>1.000000</td>\n",
744
+ " <td>0.750000</td>\n",
745
+ " <td>0.333333</td>\n",
746
+ " <td>0.200000</td>\n",
747
+ " <td>0.100000</td>\n",
748
+ " <td>0.750000</td>\n",
749
+ " <td>1.000000</td>\n",
750
+ " <td>1.000000</td>\n",
751
+ " <td>1.000000</td>\n",
752
+ " <td>0.907732</td>\n",
753
+ " <td>0.875000</td>\n",
754
+ " <td>0.875000</td>\n",
755
+ " </tr>\n",
756
+ " </tbody>\n",
757
+ "</table><p>"
758
+ ],
759
+ "text/plain": [
760
+ "<IPython.core.display.HTML object>"
761
+ ]
762
+ },
763
+ "metadata": {},
764
+ "output_type": "display_data"
765
+ }
766
+ ],
767
+ "source": [
768
+ "#commented out for now as want to run whole notebook but not retrain\n",
769
+ "# warmup_steps = int(len(loader) * EPOCHS * 0.1)\n",
770
+ "\n",
771
+ "# model.fit(\n",
772
+ "# train_objectives=[(loader, train_loss)],\n",
773
+ "# epochs=EPOCHS,\n",
774
+ "# warmup_steps=warmup_steps,\n",
775
+ "# output_path='models/midterm-compare-arctic-embed-m-ft',\n",
776
+ "# show_progress_bar=True,\n",
777
+ "# evaluator=evaluator,\n",
778
+ "# evaluation_steps=50\n",
779
+ "# )"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": 61,
785
+ "metadata": {},
786
+ "outputs": [],
787
+ "source": [
788
+ "#commented out for now as want to run whole notebook but not sending to hub\n",
789
+ "#model.push_to_hub(f\"{hf_username}/midterm-compare-arctic-embed-m-ft\")"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "code",
794
+ "execution_count": 62,
795
+ "metadata": {},
796
+ "outputs": [
797
+ {
798
+ "name": "stderr",
799
+ "output_type": "stream",
800
+ "text": [
801
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
802
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
803
+ ]
804
+ }
805
+ ],
806
+ "source": [
807
+ "finetune_embeddings = HuggingFaceEmbeddings(model_name=f\"{hf_username}/midterm-compare-arctic-embed-m-ft\")"
808
+ ]
809
+ },
810
+ {
811
+ "cell_type": "markdown",
812
+ "metadata": {},
813
+ "source": [
814
+ "###testingabove"
815
+ ]
816
+ },
817
+ {
818
+ "cell_type": "code",
819
+ "execution_count": 93,
820
+ "metadata": {},
821
+ "outputs": [
822
+ {
823
+ "name": "stderr",
824
+ "output_type": "stream",
825
+ "text": [
826
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
827
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
828
+ ]
829
+ },
830
+ {
831
+ "ename": "IndexError",
832
+ "evalue": "list index out of range",
833
+ "output_type": "error",
834
+ "traceback": [
835
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
836
+ "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
837
+ "Cell \u001b[0;32mIn[93], line 17\u001b[0m\n\u001b[1;32m 9\u001b[0m embedding_model \u001b[38;5;241m=\u001b[39m HuggingFaceEmbeddings(model_name\u001b[38;5;241m=\u001b[39mmodel_id)\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# model_id = \"Snowflake/snowflake-arctic-embed-m\"\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# embedding_model = HuggingFaceEmbeddings(model_name=model_id)\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m \n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Load documents into Qdrant\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m qdrant_vectorstore \u001b[38;5;241m=\u001b[39m \u001b[43mQdrant\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_documents\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[43mdocuments_with_metadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding_model\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m:memory:\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# In-memory for testing\u001b[39;49;00m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdocument_comparison\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Create a retriever\u001b[39;00m\n\u001b[1;32m 25\u001b[0m qdrant_retriever \u001b[38;5;241m=\u001b[39m qdrant_vectorstore\u001b[38;5;241m.\u001b[39mas_retriever()\n",
838
+ "File \u001b[0;32m~/Documents/huggingfacetesting/temptest/.venv/lib/python3.13/site-packages/langchain_core/vectorstores/base.py:852\u001b[0m, in \u001b[0;36mVectorStore.from_documents\u001b[0;34m(cls, documents, embedding, **kwargs)\u001b[0m\n\u001b[1;32m 849\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(ids):\n\u001b[1;32m 850\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mids\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m ids\n\u001b[0;32m--> 852\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_texts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtexts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43membedding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadatas\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadatas\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
839
+ "File \u001b[0;32m~/Documents/huggingfacetesting/temptest/.venv/lib/python3.13/site-packages/langchain_community/vectorstores/qdrant.py:1337\u001b[0m, in \u001b[0;36mQdrant.from_texts\u001b[0;34m(cls, texts, embedding, metadatas, ids, location, url, port, grpc_port, prefer_grpc, https, api_key, prefix, timeout, host, path, collection_name, distance_func, content_payload_key, metadata_payload_key, vector_name, batch_size, shard_number, replication_factor, write_consistency_factor, on_disk_payload, hnsw_config, optimizers_config, wal_config, quantization_config, init_from, on_disk, force_recreate, **kwargs)\u001b[0m\n\u001b[1;32m 1197\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 1198\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfrom_texts\u001b[39m(\n\u001b[1;32m 1199\u001b[0m \u001b[38;5;28mcls\u001b[39m: Type[Qdrant],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1232\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 1233\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Qdrant:\n\u001b[1;32m 1234\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Construct Qdrant wrapper from a list of texts.\u001b[39;00m\n\u001b[1;32m 1235\u001b[0m \n\u001b[1;32m 1236\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1335\u001b[0m \u001b[38;5;124;03m qdrant = Qdrant.from_texts(texts, embeddings, \"localhost\")\u001b[39;00m\n\u001b[1;32m 1336\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1337\u001b[0m qdrant \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconstruct_instance\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1338\u001b[0m \u001b[43m \u001b[49m\u001b[43mtexts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1339\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1340\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1341\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1342\u001b[0m \u001b[43m \u001b[49m\u001b[43mport\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1343\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrpc_port\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1344\u001b[0m \u001b[43m \u001b[49m\u001b[43mprefer_grpc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1345\u001b[0m \u001b[43m \u001b[49m\u001b[43mhttps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1346\u001b[0m \u001b[43m \u001b[49m\u001b[43mapi_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1347\u001b[0m \u001b[43m \u001b[49m\u001b[43mprefix\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1348\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1349\u001b[0m \u001b[43m \u001b[49m\u001b[43mhost\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1350\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1351\u001b[0m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1352\u001b[0m \u001b[43m \u001b[49m\u001b[43mdistance_func\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1353\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_payload_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1354\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata_payload_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1355\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mvector_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1356\u001b[0m \u001b[43m \u001b[49m\u001b[43mshard_number\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1357\u001b[0m \u001b[43m \u001b[49m\u001b[43mreplication_factor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1358\u001b[0m \u001b[43m \u001b[49m\u001b[43mwrite_consistency_factor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1359\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_disk_payload\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1360\u001b[0m \u001b[43m \u001b[49m\u001b[43mhnsw_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1361\u001b[0m \u001b[43m \u001b[49m\u001b[43moptimizers_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1362\u001b[0m \u001b[43m \u001b[49m\u001b[43mwal_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1363\u001b[0m \u001b[43m \u001b[49m\u001b[43mquantization_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1364\u001b[0m \u001b[43m \u001b[49m\u001b[43minit_from\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1365\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_disk\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1366\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_recreate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1367\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1368\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1369\u001b[0m qdrant\u001b[38;5;241m.\u001b[39madd_texts(texts, metadatas, ids, batch_size)\n\u001b[1;32m 1370\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m qdrant\n",
840
+ "File \u001b[0;32m~/Documents/huggingfacetesting/temptest/.venv/lib/python3.13/site-packages/langchain_community/vectorstores/qdrant.py:1640\u001b[0m, in \u001b[0;36mQdrant.construct_instance\u001b[0;34m(cls, texts, embedding, location, url, port, grpc_port, prefer_grpc, https, api_key, prefix, timeout, host, path, collection_name, distance_func, content_payload_key, metadata_payload_key, vector_name, shard_number, replication_factor, write_consistency_factor, on_disk_payload, hnsw_config, optimizers_config, wal_config, quantization_config, init_from, on_disk, force_recreate, **kwargs)\u001b[0m\n\u001b[1;32m 1638\u001b[0m \u001b[38;5;66;03m# Just do a single quick embedding to get vector size\u001b[39;00m\n\u001b[1;32m 1639\u001b[0m partial_embeddings \u001b[38;5;241m=\u001b[39m embedding\u001b[38;5;241m.\u001b[39membed_documents(texts[:\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m-> 1640\u001b[0m vector_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[43mpartial_embeddings\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m)\n\u001b[1;32m 1641\u001b[0m collection_name \u001b[38;5;241m=\u001b[39m collection_name \u001b[38;5;129;01mor\u001b[39;00m uuid\u001b[38;5;241m.\u001b[39muuid4()\u001b[38;5;241m.\u001b[39mhex\n\u001b[1;32m 1642\u001b[0m distance_func \u001b[38;5;241m=\u001b[39m distance_func\u001b[38;5;241m.\u001b[39mupper()\n",
841
+ "\u001b[0;31mIndexError\u001b[0m: list index out of range"
842
+ ]
843
+ }
844
+ ],
845
+ "source": [
846
+ "from sentence_transformers import SentenceTransformer\n",
847
+ "from langchain.vectorstores import Qdrant\n",
848
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
849
+ "\n",
850
+ "\n",
851
+ "# Load the SentenceTransformer model\n",
852
+ "#model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
853
+ "model_id = f\"{hf_username}/midterm-compare-arctic-embed-m-ft\" \n",
854
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
855
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
856
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
857
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\n",
858
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id, model_kwargs={\"trust_remote_code\": True})\n",
859
+ "\n",
860
+ "\n",
861
+ "# Load documents into Qdrant\n",
862
+ "qdrant_vectorstore = Qdrant.from_documents(\n",
863
+ " documents_with_metadata,\n",
864
+ " embedding_model,\n",
865
+ " location=\":memory:\", # In-memory for testing\n",
866
+ " collection_name=\"document_comparison\",\n",
867
+ ")\n",
868
+ "\n",
869
+ "# Create a retriever\n",
870
+ "qdrant_retriever = qdrant_vectorstore.as_retriever()"
871
+ ]
872
+ },
873
+ {
874
+ "cell_type": "code",
875
+ "execution_count": 64,
876
+ "metadata": {},
877
+ "outputs": [],
878
+ "source": [
879
+ "from langchain_core.prompts import ChatPromptTemplate\n",
880
+ "RAG_PROMPT = \"\"\"\n",
881
+ "CONTEXT:\n",
882
+ "{context}\n",
883
+ "\n",
884
+ "QUERY:\n",
885
+ "{question}\n",
886
+ "\n",
887
+ "You are a helpful assistant. Use the available context to answer the question.\n",
888
+ "\n",
889
+ "Return the response in **valid JSON format** with the following structure:\n",
890
+ "\n",
891
+ "[\n",
892
+ " {{\n",
893
+ " \"Derived Description\": \"A short name for the matched concept\",\n",
894
+ " \"Protocol_1_Name\": \"Protocol 1 - Matching Element\",\n",
895
+ " \"Protocol_2_Name\": \"Protocol 2 - Matching Element\"\n",
896
+ " }},\n",
897
+ " ...\n",
898
+ "]\n",
899
+ "\n",
900
+ "### Rules:\n",
901
+ "1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.\n",
902
+ "2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.\n",
903
+ "3. If no matching element is found in a protocol, leave it empty (\"\").\n",
904
+ "4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.\n",
905
+ "5. It should include all the elements in the two protocols.\n",
906
+ "6. If it cannot match the element, create the row and include the protocol it did find and put \"could not match\" in the other protocol column.\n",
907
+ "\"\"\"\n",
908
+ "\n",
909
+ "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)\n",
910
+ "\n",
911
+ "from langchain_openai import ChatOpenAI\n",
912
+ "\n",
913
+ "#openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
914
+ "openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
915
+ "\n",
916
+ "from operator import itemgetter\n",
917
+ "from langchain.schema.output_parser import StrOutputParser\n",
918
+ "\n",
919
+ "rag_chain = (\n",
920
+ " {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
921
+ " | rag_prompt | openai_chat_model | StrOutputParser()\n",
922
+ ")"
923
+ ]
924
+ },
925
+ {
926
+ "cell_type": "code",
927
+ "execution_count": 65,
928
+ "metadata": {},
929
+ "outputs": [],
930
+ "source": [
931
+ "question_text = \"\"\"Between these two files containing protocols, can you find the data elements in each that most likely match the element in the other and output a CSV that lists three columns:\n",
932
+ "\n",
933
+ "The questions within elements will be similar between the two documents and can be used to match the elements.\n",
934
+ "\n",
935
+ "1. Derived description from the two documents describing the index/measure/scale.\n",
936
+ "2. A column for each standard.\n",
937
+ "3. In the column for each name/version, the data element used to capture that description.\n",
938
+ "\n",
939
+ "There should only be one row for each scale/index/etc.\n",
940
+ "The description should not be one of the questions but a name that best describes the similar data elements.\"\"\"\n",
941
+ "\n",
942
+ "response_text = rag_chain.invoke({\"question\": question_text})\n",
943
+ "# response = rag_chain.invoke({\"question\": question_text})"
944
+ ]
945
+ },
946
+ {
947
+ "cell_type": "code",
948
+ "execution_count": 66,
949
+ "metadata": {},
950
+ "outputs": [
951
+ {
952
+ "name": "stdout",
953
+ "output_type": "stream",
954
+ "text": [
955
+ "✅ CSV file saved: matching_data_elements.csv\n"
956
+ ]
957
+ }
958
+ ],
959
+ "source": [
960
+ "import json\n",
961
+ "import pandas as pd\n",
962
+ "\n",
963
+ "def parse_rag_output(response_text):\n",
964
+ " \"\"\"Extract structured JSON data from the RAG response.\"\"\"\n",
965
+ " try:\n",
966
+ " structured_data = json.loads(response_text)\n",
967
+ "\n",
968
+ " # Ensure similarity score is always included\n",
969
+ " for item in structured_data:\n",
970
+ " item.setdefault(\"Similarity Score\", \"N/A\") # Default if missing\n",
971
+ "\n",
972
+ " return structured_data\n",
973
+ " except json.JSONDecodeError:\n",
974
+ " print(\"Error: Response is not valid JSON.\")\n",
975
+ " return None\n",
976
+ "\n",
977
+ "def save_to_csv(data, directory=\"./output\", filename=\"matching_data_elements.csv\"):\n",
978
+ " \"\"\"Save structured data to CSV.\"\"\"\n",
979
+ " if not data:\n",
980
+ " print(\"No data to save.\")\n",
981
+ " return\n",
982
+ "\n",
983
+ " file_path = os.path.join(directory, filename)\n",
984
+ " df = pd.DataFrame(data, columns=[\"Derived Description\", \"Protocol_1_Name\", \"Protocol_2_Name\"]) # Ensure correct columns\n",
985
+ " df.to_csv(file_path, index=False)\n",
986
+ " print(f\"✅ CSV file saved: {filename}\")\n",
987
+ "\n",
988
+ "# Run the pipeline\n",
989
+ "structured_output = parse_rag_output(response_text)\n",
990
+ "save_to_csv(structured_output)\n"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": null,
996
+ "metadata": {},
997
+ "outputs": [],
998
+ "source": []
999
+ },
1000
+ {
1001
+ "cell_type": "code",
1002
+ "execution_count": 67,
1003
+ "metadata": {},
1004
+ "outputs": [],
1005
+ "source": [
1006
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2\"})"
1007
+ ]
1008
+ },
1009
+ {
1010
+ "cell_type": "code",
1011
+ "execution_count": 68,
1012
+ "metadata": {},
1013
+ "outputs": [],
1014
+ "source": [
1015
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2. these are example headings not the ones in the actual documents. just list the matches not the rational. Can you list multiple matches?\"})"
1016
+ ]
1017
+ },
1018
+ {
1019
+ "cell_type": "code",
1020
+ "execution_count": null,
1021
+ "metadata": {},
1022
+ "outputs": [],
1023
+ "source": []
1024
+ },
1025
+ {
1026
+ "cell_type": "code",
1027
+ "execution_count": null,
1028
+ "metadata": {},
1029
+ "outputs": [],
1030
+ "source": []
1031
+ },
1032
+ {
1033
+ "cell_type": "code",
1034
+ "execution_count": 96,
1035
+ "metadata": {},
1036
+ "outputs": [],
1037
+ "source": [
1038
+ "### ragas testing below\n",
1039
+ "#docs = documents_with_metadata\n",
1040
+ "docs = text_loader.load()"
1041
+ ]
1042
+ },
1043
+ {
1044
+ "cell_type": "code",
1045
+ "execution_count": 91,
1046
+ "metadata": {},
1047
+ "outputs": [],
1048
+ "source": [
1049
+ "from langchain_core.prompts import ChatPromptTemplate\n",
1050
+ "\n",
1051
+ "RAG_PROMPT = \"\"\"\\\n",
1052
+ "Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.\n",
1053
+ "\n",
1054
+ "Context:\n",
1055
+ "{context}\n",
1056
+ "\n",
1057
+ "Question:\n",
1058
+ "{question}\n",
1059
+ "\n",
1060
+ "Answer:\n",
1061
+ "\"\"\"\n",
1062
+ "\n",
1063
+ "rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)"
1064
+ ]
1065
+ },
1066
+ {
1067
+ "cell_type": "code",
1068
+ "execution_count": 92,
1069
+ "metadata": {},
1070
+ "outputs": [],
1071
+ "source": [
1072
+ "rag_llm = ChatOpenAI(\n",
1073
+ " model=\"gpt-4o-mini\",\n",
1074
+ " temperature=0\n",
1075
+ ")"
1076
+ ]
1077
+ },
1078
+ {
1079
+ "cell_type": "code",
1080
+ "execution_count": null,
1081
+ "metadata": {},
1082
+ "outputs": [],
1083
+ "source": []
1084
+ },
1085
+ {
1086
+ "cell_type": "code",
1087
+ "execution_count": 113,
1088
+ "metadata": {},
1089
+ "outputs": [
1090
+ {
1091
+ "name": "stderr",
1092
+ "output_type": "stream",
1093
+ "text": [
1094
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
1095
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
1096
+ ]
1097
+ }
1098
+ ],
1099
+ "source": [
1100
+ "base_model_id = f\"Snowflake/snowflake-arctic-embed-m\" \n",
1101
+ "base_embedding_model = HuggingFaceEmbeddings(model_name=base_model_id)\n",
1102
+ "\n",
1103
+ "finetune_model_id = f\"{hf_username}/midterm-compare-arctic-embed-m-ft\" \n",
1104
+ "finetune_embedding_model = HuggingFaceEmbeddings(model_name=finetune_model_id)\n",
1105
+ "\n",
1106
+ "openai_model_id = \"text-embedding-3-small\"\n",
1107
+ "openai_embedding_model = OpenAIEmbeddings(model=openai_model_id)\n"
1108
+ ]
1109
+ },
1110
+ {
1111
+ "cell_type": "code",
1112
+ "execution_count": 114,
1113
+ "metadata": {},
1114
+ "outputs": [],
1115
+ "source": [
1116
+ "#from langchain_community.vectorstores import FAISS\n",
1117
+ "\n",
1118
+ "### try qdrant?\n",
1119
+ "\n",
1120
+ "qdrant_vectorstore_base = Qdrant.from_documents(\n",
1121
+ " docs,\n",
1122
+ " base_embedding_model,\n",
1123
+ " location=\":memory:\", # In-memory for testing\n",
1124
+ " collection_name=\"document_comparison\",\n",
1125
+ ")\n",
1126
+ "\n",
1127
+ "\n",
1128
+ "base_retriever = qdrant_vectorstore_base.as_retriever(search_kwargs={\"k\": 6})\n",
1129
+ "\n",
1130
+ "qdrant_vectorstore_finetune = Qdrant.from_documents(\n",
1131
+ " docs,\n",
1132
+ " finetune_embedding_model,\n",
1133
+ " location=\":memory:\", # In-memory for testing\n",
1134
+ " collection_name=\"document_comparison\",\n",
1135
+ ")\n",
1136
+ "\n",
1137
+ "\n",
1138
+ "finetune_retriever = qdrant_vectorstore_finetune.as_retriever(search_kwargs={\"k\": 6})\n",
1139
+ "\n",
1140
+ "\n",
1141
+ "\n",
1142
+ "qdrant_vectorstore_openai = Qdrant.from_documents(\n",
1143
+ " docs,\n",
1144
+ " openai_embedding_model,\n",
1145
+ " location=\":memory:\", # In-memory for testing\n",
1146
+ " collection_name=\"document_comparison\",\n",
1147
+ ")\n",
1148
+ "\n",
1149
+ "\n",
1150
+ "openai_retriever = qdrant_vectorstore_openai.as_retriever(search_kwargs={\"k\": 6})\n"
1151
+ ]
1152
+ },
1153
+ {
1154
+ "cell_type": "code",
1155
+ "execution_count": null,
1156
+ "metadata": {},
1157
+ "outputs": [],
1158
+ "source": [
1159
+ "\n",
1160
+ "# # Create a retriever\n",
1161
+ "# qdrant_retriever = qdrant_vectorstore.as_retriever()\n",
1162
+ "\n",
1163
+ "\n",
1164
+ "\n",
1165
+ "\n",
1166
+ "\n",
1167
+ "# ###\n",
1168
+ "\n",
1169
+ "# base_vectorstore = FAISS.from_documents(training_documents, base_embedding_model)\n",
1170
+ "# base_retriever = base_vectorstore.as_retriever(search_kwargs={\"k\": 6})"
1171
+ ]
1172
+ },
1173
+ {
1174
+ "cell_type": "code",
1175
+ "execution_count": 100,
1176
+ "metadata": {},
1177
+ "outputs": [],
1178
+ "source": [
1179
+ "from langchain.schema.runnable import RunnablePassthrough\n",
1180
+ "\n",
1181
+ "base_rag_chain = (\n",
1182
+ " {\"context\": itemgetter(\"question\") | base_retriever, \"question\": itemgetter(\"question\")}\n",
1183
+ " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
1184
+ " | {\"response\": rag_prompt_template | rag_llm | StrOutputParser(), \"context\": itemgetter(\"context\")}\n",
1185
+ ")"
1186
+ ]
1187
+ },
1188
+ {
1189
+ "cell_type": "code",
1190
+ "execution_count": 102,
1191
+ "metadata": {},
1192
+ "outputs": [],
1193
+ "source": [
1194
+ "finetune_rag_chain = (\n",
1195
+ " {\"context\": itemgetter(\"question\") | finetune_retriever, \"question\": itemgetter(\"question\")}\n",
1196
+ " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
1197
+ " | {\"response\": rag_prompt_template | rag_llm | StrOutputParser(), \"context\": itemgetter(\"context\")}\n",
1198
+ ")"
1199
+ ]
1200
+ },
1201
+ {
1202
+ "cell_type": "code",
1203
+ "execution_count": 115,
1204
+ "metadata": {},
1205
+ "outputs": [],
1206
+ "source": [
1207
+ "from langchain.schema.runnable import RunnablePassthrough\n",
1208
+ "\n",
1209
+ "openai_rag_chain = (\n",
1210
+ " {\"context\": itemgetter(\"question\") | openai_retriever, \"question\": itemgetter(\"question\")}\n",
1211
+ " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
1212
+ " | {\"response\": rag_prompt_template | rag_llm | StrOutputParser(), \"context\": itemgetter(\"context\")}\n",
1213
+ ")"
1214
+ ]
1215
+ },
1216
+ {
1217
+ "cell_type": "code",
1218
+ "execution_count": 87,
1219
+ "metadata": {},
1220
+ "outputs": [],
1221
+ "source": [
1222
+ "\n"
1223
+ ]
1224
+ },
1225
+ {
1226
+ "cell_type": "code",
1227
+ "execution_count": 103,
1228
+ "metadata": {},
1229
+ "outputs": [],
1230
+ "source": [
1231
+ "from ragas.llms import LangchainLLMWrapper\n",
1232
+ "from ragas.embeddings import LangchainEmbeddingsWrapper\n",
1233
+ "from langchain_openai import ChatOpenAI\n",
1234
+ "from langchain_openai import OpenAIEmbeddings\n",
1235
+ "generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\n",
1236
+ "generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())"
1237
+ ]
1238
+ },
1239
+ {
1240
+ "cell_type": "code",
1241
+ "execution_count": 104,
1242
+ "metadata": {},
1243
+ "outputs": [
1244
+ {
1245
+ "data": {
1246
+ "application/vnd.jupyter.widget-view+json": {
1247
+ "model_id": "7c3166b3cd08451a9b2d35c0b73581af",
1248
+ "version_major": 2,
1249
+ "version_minor": 0
1250
+ },
1251
+ "text/plain": [
1252
+ "Applying SummaryExtractor: 0%| | 0/6 [00:00<?, ?it/s]"
1253
+ ]
1254
+ },
1255
+ "metadata": {},
1256
+ "output_type": "display_data"
1257
+ },
1258
+ {
1259
+ "data": {
1260
+ "application/vnd.jupyter.widget-view+json": {
1261
+ "model_id": "84fc7afd0ff04c0e8990cb88b9978867",
1262
+ "version_major": 2,
1263
+ "version_minor": 0
1264
+ },
1265
+ "text/plain": [
1266
+ "Applying CustomNodeFilter: 0%| | 0/7 [00:00<?, ?it/s]"
1267
+ ]
1268
+ },
1269
+ "metadata": {},
1270
+ "output_type": "display_data"
1271
+ },
1272
+ {
1273
+ "name": "stderr",
1274
+ "output_type": "stream",
1275
+ "text": [
1276
+ "Node 77fa3fd5-0ec7-4864-8a9f-fb6df33f64ec does not have a summary. Skipping filtering.\n"
1277
+ ]
1278
+ },
1279
+ {
1280
+ "data": {
1281
+ "application/vnd.jupyter.widget-view+json": {
1282
+ "model_id": "8e6bcaf303d641fa8c48f3dd8f077771",
1283
+ "version_major": 2,
1284
+ "version_minor": 0
1285
+ },
1286
+ "text/plain": [
1287
+ "Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]: 0%| | 0/20 [00:00<?, ?it/s]"
1288
+ ]
1289
+ },
1290
+ "metadata": {},
1291
+ "output_type": "display_data"
1292
+ },
1293
+ {
1294
+ "data": {
1295
+ "application/vnd.jupyter.widget-view+json": {
1296
+ "model_id": "4146d76a8f93496d909b6f56f2b99644",
1297
+ "version_major": 2,
1298
+ "version_minor": 0
1299
+ },
1300
+ "text/plain": [
1301
+ "Applying OverlapScoreBuilder: 0%| | 0/1 [00:00<?, ?it/s]"
1302
+ ]
1303
+ },
1304
+ "metadata": {},
1305
+ "output_type": "display_data"
1306
+ },
1307
+ {
1308
+ "data": {
1309
+ "application/vnd.jupyter.widget-view+json": {
1310
+ "model_id": "7bf10ce73bf04cdf9c8bb81d5134095f",
1311
+ "version_major": 2,
1312
+ "version_minor": 0
1313
+ },
1314
+ "text/plain": [
1315
+ "Generating personas: 0%| | 0/3 [00:00<?, ?it/s]"
1316
+ ]
1317
+ },
1318
+ "metadata": {},
1319
+ "output_type": "display_data"
1320
+ },
1321
+ {
1322
+ "data": {
1323
+ "application/vnd.jupyter.widget-view+json": {
1324
+ "model_id": "8c5a3b61bcb94ab19b0478a95b1b43ad",
1325
+ "version_major": 2,
1326
+ "version_minor": 0
1327
+ },
1328
+ "text/plain": [
1329
+ "Generating Scenarios: 0%| | 0/1 [00:00<?, ?it/s]"
1330
+ ]
1331
+ },
1332
+ "metadata": {},
1333
+ "output_type": "display_data"
1334
+ },
1335
+ {
1336
+ "data": {
1337
+ "application/vnd.jupyter.widget-view+json": {
1338
+ "model_id": "88fb910b941344ea9b2414c3010fad47",
1339
+ "version_major": 2,
1340
+ "version_minor": 0
1341
+ },
1342
+ "text/plain": [
1343
+ "Generating Samples: 0%| | 0/10 [00:00<?, ?it/s]"
1344
+ ]
1345
+ },
1346
+ "metadata": {},
1347
+ "output_type": "display_data"
1348
+ }
1349
+ ],
1350
+ "source": [
1351
+ "from ragas.testset import TestsetGenerator\n",
1352
+ "\n",
1353
+ "generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)\n",
1354
+ "dataset = generator.generate_with_langchain_docs(docs, testset_size=10)"
1355
+ ]
1356
+ },
1357
+ {
1358
+ "cell_type": "code",
1359
+ "execution_count": 105,
1360
+ "metadata": {},
1361
+ "outputs": [
1362
+ {
1363
+ "data": {
1364
+ "text/html": [
1365
+ "<div>\n",
1366
+ "<style scoped>\n",
1367
+ " .dataframe tbody tr th:only-of-type {\n",
1368
+ " vertical-align: middle;\n",
1369
+ " }\n",
1370
+ "\n",
1371
+ " .dataframe tbody tr th {\n",
1372
+ " vertical-align: top;\n",
1373
+ " }\n",
1374
+ "\n",
1375
+ " .dataframe thead th {\n",
1376
+ " text-align: right;\n",
1377
+ " }\n",
1378
+ "</style>\n",
1379
+ "<table border=\"1\" class=\"dataframe\">\n",
1380
+ " <thead>\n",
1381
+ " <tr style=\"text-align: right;\">\n",
1382
+ " <th></th>\n",
1383
+ " <th>user_input</th>\n",
1384
+ " <th>reference_contexts</th>\n",
1385
+ " <th>reference</th>\n",
1386
+ " <th>synthesizer_name</th>\n",
1387
+ " </tr>\n",
1388
+ " </thead>\n",
1389
+ " <tbody>\n",
1390
+ " <tr>\n",
1391
+ " <th>0</th>\n",
1392
+ " <td>How does the Pain Coping Strategy Scale (PCSS-...</td>\n",
1393
+ " <td>[Linked Psychological &amp; Physical Assessment\\nP...</td>\n",
1394
+ " <td>The Pain Coping Strategy Scale (PCSS-9) measur...</td>\n",
1395
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1396
+ " </tr>\n",
1397
+ " <tr>\n",
1398
+ " <th>1</th>\n",
1399
+ " <td>Cud yu pleese explane wut the Pain Coping Stra...</td>\n",
1400
+ " <td>[Linked Psychological &amp; Physical Assessment\\nP...</td>\n",
1401
+ " <td>The Pain Coping Strategy Scale (PCSS-9) measur...</td>\n",
1402
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1403
+ " </tr>\n",
1404
+ " <tr>\n",
1405
+ " <th>2</th>\n",
1406
+ " <td>Wht is the ERI-9 and how does it relate to emo...</td>\n",
1407
+ " <td>[Financial Stress Index (FSI-6)\\nThe FSI-6 eva...</td>\n",
1408
+ " <td>The Emotional Regulation Index (ERI-9) is ment...</td>\n",
1409
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1410
+ " </tr>\n",
1411
+ " <tr>\n",
1412
+ " <th>3</th>\n",
1413
+ " <td>what cognitive load management scale do</td>\n",
1414
+ " <td>[Financial Stress Index (FSI-6)\\nThe FSI-6 eva...</td>\n",
1415
+ " <td>The Cognitive Load Management Scale (CLMS-7) m...</td>\n",
1416
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1417
+ " </tr>\n",
1418
+ " <tr>\n",
1419
+ " <th>4</th>\n",
1420
+ " <td>What does the MRI-6 assessment evaluate?</td>\n",
1421
+ " <td>[The ERI-9 assesses an individual's ability to...</td>\n",
1422
+ " <td>The MRI-6 evaluates short-term and long-term m...</td>\n",
1423
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1424
+ " </tr>\n",
1425
+ " <tr>\n",
1426
+ " <th>5</th>\n",
1427
+ " <td>what scm-6 do for social confidence and public...</td>\n",
1428
+ " <td>[The ERI-9 assesses an individual's ability to...</td>\n",
1429
+ " <td>The SCM-6 evaluates levels of confidence in so...</td>\n",
1430
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1431
+ " </tr>\n",
1432
+ " <tr>\n",
1433
+ " <th>6</th>\n",
1434
+ " <td>What does the RDMT-6 assess in terms of cognit...</td>\n",
1435
+ " <td>[Linked Psychological &amp; Physical Assessment\\nC...</td>\n",
1436
+ " <td>The RDMT-6 evaluates logical reasoning and dec...</td>\n",
1437
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1438
+ " </tr>\n",
1439
+ " <tr>\n",
1440
+ " <th>7</th>\n",
1441
+ " <td>What does the CPAI-10 assess in individuals wi...</td>\n",
1442
+ " <td>[Linked Psychological &amp; Physical Assessment\\nC...</td>\n",
1443
+ " <td>The CPAI-10 evaluates the strategies people us...</td>\n",
1444
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1445
+ " </tr>\n",
1446
+ " <tr>\n",
1447
+ " <th>8</th>\n",
1448
+ " <td>What does the CWT-7 assessment measure in term...</td>\n",
1449
+ " <td>[I feel confident when making important decisi...</td>\n",
1450
+ " <td>The CWT-7 evaluates an individual's ability to...</td>\n",
1451
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1452
+ " </tr>\n",
1453
+ " <tr>\n",
1454
+ " <th>9</th>\n",
1455
+ " <td>What does the Stamina and Endurance Index (SEI...</td>\n",
1456
+ " <td>[I feel confident when making important decisi...</td>\n",
1457
+ " <td>The Stamina and Endurance Index (SEI-8) measur...</td>\n",
1458
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1459
+ " </tr>\n",
1460
+ " </tbody>\n",
1461
+ "</table>\n",
1462
+ "</div>"
1463
+ ],
1464
+ "text/plain": [
1465
+ " user_input \\\n",
1466
+ "0 How does the Pain Coping Strategy Scale (PCSS-... \n",
1467
+ "1 Cud yu pleese explane wut the Pain Coping Stra... \n",
1468
+ "2 Wht is the ERI-9 and how does it relate to emo... \n",
1469
+ "3 what cognitive load management scale do \n",
1470
+ "4 What does the MRI-6 assessment evaluate? \n",
1471
+ "5 what scm-6 do for social confidence and public... \n",
1472
+ "6 What does the RDMT-6 assess in terms of cognit... \n",
1473
+ "7 What does the CPAI-10 assess in individuals wi... \n",
1474
+ "8 What does the CWT-7 assessment measure in term... \n",
1475
+ "9 What does the Stamina and Endurance Index (SEI... \n",
1476
+ "\n",
1477
+ " reference_contexts \\\n",
1478
+ "0 [Linked Psychological & Physical Assessment\\nP... \n",
1479
+ "1 [Linked Psychological & Physical Assessment\\nP... \n",
1480
+ "2 [Financial Stress Index (FSI-6)\\nThe FSI-6 eva... \n",
1481
+ "3 [Financial Stress Index (FSI-6)\\nThe FSI-6 eva... \n",
1482
+ "4 [The ERI-9 assesses an individual's ability to... \n",
1483
+ "5 [The ERI-9 assesses an individual's ability to... \n",
1484
+ "6 [Linked Psychological & Physical Assessment\\nC... \n",
1485
+ "7 [Linked Psychological & Physical Assessment\\nC... \n",
1486
+ "8 [I feel confident when making important decisi... \n",
1487
+ "9 [I feel confident when making important decisi... \n",
1488
+ "\n",
1489
+ " reference \\\n",
1490
+ "0 The Pain Coping Strategy Scale (PCSS-9) measur... \n",
1491
+ "1 The Pain Coping Strategy Scale (PCSS-9) measur... \n",
1492
+ "2 The Emotional Regulation Index (ERI-9) is ment... \n",
1493
+ "3 The Cognitive Load Management Scale (CLMS-7) m... \n",
1494
+ "4 The MRI-6 evaluates short-term and long-term m... \n",
1495
+ "5 The SCM-6 evaluates levels of confidence in so... \n",
1496
+ "6 The RDMT-6 evaluates logical reasoning and dec... \n",
1497
+ "7 The CPAI-10 evaluates the strategies people us... \n",
1498
+ "8 The CWT-7 evaluates an individual's ability to... \n",
1499
+ "9 The Stamina and Endurance Index (SEI-8) measur... \n",
1500
+ "\n",
1501
+ " synthesizer_name \n",
1502
+ "0 single_hop_specifc_query_synthesizer \n",
1503
+ "1 single_hop_specifc_query_synthesizer \n",
1504
+ "2 single_hop_specifc_query_synthesizer \n",
1505
+ "3 single_hop_specifc_query_synthesizer \n",
1506
+ "4 single_hop_specifc_query_synthesizer \n",
1507
+ "5 single_hop_specifc_query_synthesizer \n",
1508
+ "6 single_hop_specifc_query_synthesizer \n",
1509
+ "7 single_hop_specifc_query_synthesizer \n",
1510
+ "8 single_hop_specifc_query_synthesizer \n",
1511
+ "9 single_hop_specifc_query_synthesizer "
1512
+ ]
1513
+ },
1514
+ "execution_count": 105,
1515
+ "metadata": {},
1516
+ "output_type": "execute_result"
1517
+ }
1518
+ ],
1519
+ "source": [
1520
+ "dataset.to_pandas()"
1521
+ ]
1522
+ },
1523
+ {
1524
+ "cell_type": "markdown",
1525
+ "metadata": {},
1526
+ "source": [
1527
+ "Eval with base model"
1528
+ ]
1529
+ },
1530
+ {
1531
+ "cell_type": "code",
1532
+ "execution_count": 106,
1533
+ "metadata": {},
1534
+ "outputs": [],
1535
+ "source": [
1536
+ "for test_row in dataset:\n",
1537
+ " response = base_rag_chain.invoke({\"question\" : test_row.eval_sample.user_input})\n",
1538
+ " test_row.eval_sample.response = response[\"response\"]\n",
1539
+ " test_row.eval_sample.retrieved_contexts = [context.page_content for context in response[\"context\"]]"
1540
+ ]
1541
+ },
1542
+ {
1543
+ "cell_type": "code",
1544
+ "execution_count": 107,
1545
+ "metadata": {},
1546
+ "outputs": [],
1547
+ "source": [
1548
+ "from ragas.llms import LangchainLLMWrapper\n",
1549
+ "\n",
1550
+ "evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))"
1551
+ ]
1552
+ },
1553
+ {
1554
+ "cell_type": "code",
1555
+ "execution_count": 108,
1556
+ "metadata": {},
1557
+ "outputs": [],
1558
+ "source": [
1559
+ "from ragas import EvaluationDataset\n",
1560
+ "\n",
1561
+ "evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())"
1562
+ ]
1563
+ },
1564
+ {
1565
+ "cell_type": "code",
1566
+ "execution_count": 109,
1567
+ "metadata": {},
1568
+ "outputs": [
1569
+ {
1570
+ "data": {
1571
+ "application/vnd.jupyter.widget-view+json": {
1572
+ "model_id": "57340d6c46c347e19fecdc4490574a8b",
1573
+ "version_major": 2,
1574
+ "version_minor": 0
1575
+ },
1576
+ "text/plain": [
1577
+ "Evaluating: 0%| | 0/60 [00:00<?, ?it/s]"
1578
+ ]
1579
+ },
1580
+ "metadata": {},
1581
+ "output_type": "display_data"
1582
+ },
1583
+ {
1584
+ "name": "stderr",
1585
+ "output_type": "stream",
1586
+ "text": [
1587
+ "Exception raised in Job[13]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28698, Requested 2725. Please try again in 2.846s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1588
+ "Exception raised in Job[22]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29211, Requested 2254. Please try again in 2.93s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1589
+ "Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29563, Requested 2685. Please try again in 4.496s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1590
+ "Exception raised in Job[24]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29189, Requested 2555. Please try again in 3.488s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1591
+ "Exception raised in Job[28]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29993, Requested 2254. Please try again in 4.494s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1592
+ "Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29502, Requested 2743. Please try again in 4.49s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1593
+ "Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28840, Requested 2574. Please try again in 2.828s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1594
+ "Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29928, Requested 2511. Please try again in 4.878s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1595
+ "Exception raised in Job[7]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29823, Requested 2809. Please try again in 5.264s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1596
+ "Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29637, Requested 2665. Please try again in 4.604s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1597
+ "Exception raised in Job[36]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29185, Requested 2560. Please try again in 3.49s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1598
+ "Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29749, Requested 1558. Please try again in 2.614s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1599
+ "Exception raised in Job[5]: TimeoutError()\n",
1600
+ "Exception raised in Job[17]: TimeoutError()\n",
1601
+ "Exception raised in Job[43]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29678, Requested 2514. Please try again in 4.384s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1602
+ "Exception raised in Job[37]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28940, Requested 2499. Please try again in 2.878s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1603
+ "Exception raised in Job[40]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28657, Requested 2254. Please try again in 1.822s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n"
1604
+ ]
1605
+ },
1606
+ {
1607
+ "data": {
1608
+ "text/plain": [
1609
+ "{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.1973}"
1610
+ ]
1611
+ },
1612
+ "execution_count": 109,
1613
+ "metadata": {},
1614
+ "output_type": "execute_result"
1615
+ }
1616
+ ],
1617
+ "source": [
1618
+ "from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity\n",
1619
+ "from ragas import evaluate, RunConfig\n",
1620
+ "\n",
1621
+ "custom_run_config = RunConfig(timeout=360)\n",
1622
+ "\n",
1623
+ "result = evaluate(\n",
1624
+ " dataset=evaluation_dataset,\n",
1625
+ " metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],\n",
1626
+ " llm=evaluator_llm,\n",
1627
+ " run_config=custom_run_config\n",
1628
+ ")\n",
1629
+ "result"
1630
+ ]
1631
+ },
1632
+ {
1633
+ "cell_type": "markdown",
1634
+ "metadata": {},
1635
+ "source": [
1636
+ "Evaluate the Fine tuned.\n"
1637
+ ]
1638
+ },
1639
+ {
1640
+ "cell_type": "code",
1641
+ "execution_count": 110,
1642
+ "metadata": {},
1643
+ "outputs": [],
1644
+ "source": [
1645
+ "for test_row in dataset:\n",
1646
+ " response = finetune_rag_chain.invoke({\"question\" : test_row.eval_sample.user_input})\n",
1647
+ " test_row.eval_sample.response = response[\"response\"]\n",
1648
+ " test_row.eval_sample.retrieved_contexts = [context.page_content for context in response[\"context\"]]"
1649
+ ]
1650
+ },
1651
+ {
1652
+ "cell_type": "code",
1653
+ "execution_count": 111,
1654
+ "metadata": {},
1655
+ "outputs": [],
1656
+ "source": [
1657
+ "evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())"
1658
+ ]
1659
+ },
1660
+ {
1661
+ "cell_type": "code",
1662
+ "execution_count": 112,
1663
+ "metadata": {},
1664
+ "outputs": [
1665
+ {
1666
+ "data": {
1667
+ "application/vnd.jupyter.widget-view+json": {
1668
+ "model_id": "758cb2b2b6df49e88c88b1fca6c09f3c",
1669
+ "version_major": 2,
1670
+ "version_minor": 0
1671
+ },
1672
+ "text/plain": [
1673
+ "Evaluating: 0%| | 0/60 [00:00<?, ?it/s]"
1674
+ ]
1675
+ },
1676
+ "metadata": {},
1677
+ "output_type": "display_data"
1678
+ },
1679
+ {
1680
+ "name": "stderr",
1681
+ "output_type": "stream",
1682
+ "text": [
1683
+ "Exception raised in Job[22]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28950, Requested 2254. Please try again in 2.408s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1684
+ "Exception raised in Job[16]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28949, Requested 2254. Please try again in 2.406s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1685
+ "Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28567, Requested 2751. Please try again in 2.636s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1686
+ "Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28831, Requested 2511. Please try again in 2.684s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1687
+ "Exception raised in Job[28]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29242, Requested 2254. Please try again in 2.992s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1688
+ "Exception raised in Job[24]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29683, Requested 2555. Please try again in 4.476s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1689
+ "Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29672, Requested 1515. Please try again in 2.374s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1690
+ "Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29901, Requested 2743. Please try again in 5.288s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1691
+ "Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29651, Requested 2574. Please try again in 4.45s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1692
+ "Exception raised in Job[7]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29659, Requested 2771. Please try again in 4.86s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1693
+ "Exception raised in Job[34]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28744, Requested 2265. Please try again in 2.018s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1694
+ "Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29754, Requested 2665. Please try again in 4.838s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1695
+ "Exception raised in Job[5]: TimeoutError()\n",
1696
+ "Exception raised in Job[36]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29775, Requested 2560. Please try again in 4.67s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1697
+ "Exception raised in Job[17]: TimeoutError()\n",
1698
+ "Exception raised in Job[23]: TimeoutError()\n",
1699
+ "Exception raised in Job[40]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28967, Requested 2254. Please try again in 2.442s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1700
+ "Exception raised in Job[46]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28976, Requested 2250. Please try again in 2.452s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1701
+ "Exception raised in Job[37]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28735, Requested 2499. Please try again in 2.468s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n"
1702
+ ]
1703
+ },
1704
+ {
1705
+ "data": {
1706
+ "text/plain": [
1707
+ "{'context_recall': 1.0000, 'faithfulness': 0.8500, 'factual_correctness': 0.7220, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.7917, 'noise_sensitivity_relevant': 0.1111}"
1708
+ ]
1709
+ },
1710
+ "execution_count": 112,
1711
+ "metadata": {},
1712
+ "output_type": "execute_result"
1713
+ }
1714
+ ],
1715
+ "source": [
1716
+ "result = evaluate(\n",
1717
+ " dataset=evaluation_dataset,\n",
1718
+ " metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],\n",
1719
+ " llm=evaluator_llm,\n",
1720
+ " run_config=custom_run_config\n",
1721
+ ")\n",
1722
+ "result"
1723
+ ]
1724
+ },
1725
+ {
1726
+ "cell_type": "markdown",
1727
+ "metadata": {},
1728
+ "source": [
1729
+ "Evaluate the openai model"
1730
+ ]
1731
+ },
1732
+ {
1733
+ "cell_type": "code",
1734
+ "execution_count": 116,
1735
+ "metadata": {},
1736
+ "outputs": [],
1737
+ "source": [
1738
+ "for test_row in dataset:\n",
1739
+ " response = openai_rag_chain.invoke({\"question\" : test_row.eval_sample.user_input})\n",
1740
+ " test_row.eval_sample.response = response[\"response\"]\n",
1741
+ " test_row.eval_sample.retrieved_contexts = [context.page_content for context in response[\"context\"]]"
1742
+ ]
1743
+ },
1744
+ {
1745
+ "cell_type": "code",
1746
+ "execution_count": 117,
1747
+ "metadata": {},
1748
+ "outputs": [],
1749
+ "source": [
1750
+ "evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())"
1751
+ ]
1752
+ },
1753
+ {
1754
+ "cell_type": "code",
1755
+ "execution_count": 118,
1756
+ "metadata": {},
1757
+ "outputs": [
1758
+ {
1759
+ "data": {
1760
+ "application/vnd.jupyter.widget-view+json": {
1761
+ "model_id": "a3f59e7e78294492a701763a859d6239",
1762
+ "version_major": 2,
1763
+ "version_minor": 0
1764
+ },
1765
+ "text/plain": [
1766
+ "Evaluating: 0%| | 0/60 [00:00<?, ?it/s]"
1767
+ ]
1768
+ },
1769
+ "metadata": {},
1770
+ "output_type": "display_data"
1771
+ },
1772
+ {
1773
+ "name": "stderr",
1774
+ "output_type": "stream",
1775
+ "text": [
1776
+ "Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28587, Requested 2574. Please try again in 2.322s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1777
+ "Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29460, Requested 2782. Please try again in 4.484s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1778
+ "Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29365, Requested 2991. Please try again in 4.712s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1779
+ "Exception raised in Job[24]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29067, Requested 2826. Please try again in 3.786s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1780
+ "Exception raised in Job[13]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28945, Requested 2968. Please try again in 3.826s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1781
+ "Exception raised in Job[22]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29841, Requested 2525. Please try again in 4.732s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1782
+ "Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29512, Requested 2895. Please try again in 4.814s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1783
+ "Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29581, Requested 1650. Please try again in 2.462s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1784
+ "Exception raised in Job[7]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29318, Requested 3175. Please try again in 4.986s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1785
+ "Exception raised in Job[28]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28799, Requested 2525. Please try again in 2.648s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1786
+ "Exception raised in Job[5]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29787, Requested 1465. Please try again in 2.504s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1787
+ "Exception raised in Job[34]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29638, Requested 2265. Please try again in 3.805s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1788
+ "Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29242, Requested 2736. Please try again in 3.956s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1789
+ "Exception raised in Job[35]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29647, Requested 1516. Please try again in 2.326s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n"
1790
+ ]
1791
+ },
1792
+ {
1793
+ "data": {
1794
+ "text/plain": [
1795
+ "{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9463, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.3095}"
1796
+ ]
1797
+ },
1798
+ "execution_count": 118,
1799
+ "metadata": {},
1800
+ "output_type": "execute_result"
1801
+ }
1802
+ ],
1803
+ "source": [
1804
+ "result = evaluate(\n",
1805
+ " dataset=evaluation_dataset,\n",
1806
+ " metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],\n",
1807
+ " llm=evaluator_llm,\n",
1808
+ " run_config=custom_run_config\n",
1809
+ ")\n",
1810
+ "result"
1811
+ ]
1812
+ },
1813
+ {
1814
+ "cell_type": "markdown",
1815
+ "metadata": {},
1816
+ "source": []
1817
+ },
1818
+ {
1819
+ "cell_type": "markdown",
1820
+ "metadata": {},
1821
+ "source": [
1822
+ "\n",
1823
+ "Base model evaluation\n",
1824
+ "{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.1973}\n",
1825
+ "\n",
1826
+ "Finetuned model\n",
1827
+ "{'context_recall': 1.0000, 'faithfulness': 0.8500, 'factual_correctness': 0.7220, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.7917, 'noise_sensitivity_relevant': 0.1111}\n",
1828
+ "\n",
1829
+ "\n",
1830
+ "Openai model\n",
1831
+ "{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9463, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.3095}\n",
1832
+ "\n",
1833
+ "\n",
1834
+ "\n",
1835
+ "Base snowflake model and OpenAI are very similar with the openai model performing slightly better for noise sensitivity.\n",
1836
+ "The finetuned snowflak model perform does not perform better in most case though it reduces noise sensitivity."
1837
+ ]
1838
+ }
1839
+ ],
1840
+ "metadata": {
1841
+ "kernelspec": {
1842
+ "display_name": ".venv",
1843
+ "language": "python",
1844
+ "name": "python3"
1845
+ },
1846
+ "language_info": {
1847
+ "codemirror_mode": {
1848
+ "name": "ipython",
1849
+ "version": 3
1850
+ },
1851
+ "file_extension": ".py",
1852
+ "mimetype": "text/x-python",
1853
+ "name": "python",
1854
+ "nbconvert_exporter": "python",
1855
+ "pygments_lexer": "ipython3",
1856
+ "version": "3.13.1"
1857
+ }
1858
+ },
1859
+ "nbformat": 4,
1860
+ "nbformat_minor": 2
1861
+ }
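The three result dictionaries reported at the end of the notebook are easier to compare side by side than as separate outputs. A minimal sketch, with the metric values copied verbatim from the notebook outputs above, that tabulates them with pandas:

```python
import pandas as pd

# Metric values copied from the three evaluation outputs above.
results = {
    "base_snowflake": {
        "context_recall": 1.0000, "faithfulness": 1.0000, "factual_correctness": 0.7540,
        "answer_relevancy": 0.9481, "context_entity_recall": 0.8095, "noise_sensitivity_relevant": 0.1973,
    },
    "finetuned_snowflake": {
        "context_recall": 1.0000, "faithfulness": 0.8500, "factual_correctness": 0.7220,
        "answer_relevancy": 0.9481, "context_entity_recall": 0.7917, "noise_sensitivity_relevant": 0.1111,
    },
    "openai": {
        "context_recall": 1.0000, "faithfulness": 1.0000, "factual_correctness": 0.7540,
        "answer_relevancy": 0.9463, "context_entity_recall": 0.8095, "noise_sensitivity_relevant": 0.3095,
    },
}

# One row per embedding model, one column per RAGAS metric.
comparison = pd.DataFrame(results).T
print(comparison.round(4))
```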
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Get a distribution that has uv already installed
2
+ FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim
3
+
4
+ # Add user - this is the user that will run the app
5
+ # If you do not set user, the app will run as root (undesirable)
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+
9
+ # Set the home directory and path
10
+ ENV HOME=/home/user \
11
+ PATH=/home/user/.local/bin:$PATH
12
+
13
+ ENV UVICORN_WS_PROTOCOL=websockets
14
+
15
+
16
+ # Set the working directory
17
+ WORKDIR $HOME/app
18
+
19
+ # Copy the app to the container
20
+ COPY --chown=user . $HOME/app
21
+
22
+ # Install the dependencies
23
+ # RUN uv sync --frozen
24
+ RUN uv sync
25
+
26
+ # Expose the port
27
+ EXPOSE 7860
28
+
29
+ # Run the app
30
+ CMD ["uv", "run", "chainlit", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: midterm_poc
3
+ emoji: 🌖
4
+ colorFrom: gray
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ short_description: midterm POC
9
+ license: apache-2.0
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import json
4
+ import pandas as pd
5
+ import chainlit as cl
6
+ from dotenv import load_dotenv
7
+ from langchain_core.documents import Document
8
+ from langchain_community.document_loaders import PyMuPDFLoader
9
+ from langchain_experimental.text_splitter import SemanticChunker
10
+ from langchain_community.vectorstores import Qdrant
11
+ from langchain_community.embeddings import HuggingFaceEmbeddings
12
+ from langchain_core.output_parsers import StrOutputParser
13
+ from langchain_openai import ChatOpenAI
14
+ from langchain_core.prompts import ChatPromptTemplate
15
+ from langgraph.graph import START, StateGraph
16
+ from langchain.tools import tool
17
+ from langchain.schema import HumanMessage
18
+ from typing_extensions import List, TypedDict
19
+ from operator import itemgetter
20
+
21
+ # Load environment variables
22
+ load_dotenv()
23
+
24
+ # Define paths
25
+ UPLOAD_PATH = "upload/"
26
+ OUTPUT_PATH = "output/"
27
+ os.makedirs(UPLOAD_PATH, exist_ok=True)
28
+ os.makedirs(OUTPUT_PATH, exist_ok=True)
29
+
30
+ # Initialize embeddings model
31
+ model_id = "Snowflake/snowflake-arctic-embed-m"
32
+ embedding_model = HuggingFaceEmbeddings(model_name=model_id)
33
+
34
+ # Define semantic chunker
35
+ semantic_splitter = SemanticChunker(embedding_model)
36
+
37
+ # Initialize LLM
38
+ llm = ChatOpenAI(model="gpt-4o-mini")
39
+
40
+ # Define RAG prompt
41
+ export_prompt = """
42
+ CONTEXT:
43
+ {context}
44
+
45
+ QUERY:
46
+ {question}
47
+
48
+ You are a helpful assistant. Use the available context to answer the question.
49
+
50
+ Between these two files containing protocols, identify and match **entire assessment sections** based on conceptual similarity. Do NOT match individual questions.
51
+
52
+ ### **Output Format:**
53
+ Return the response in **valid JSON format** structured as a list of dictionaries, where each dictionary contains:
54
+ [
55
+ {{
56
+ "Derived Description": "A short name for the matched concept",
57
+ "Protocol_1": "Protocol 1 - Matching Element",
58
+ "Protocol_2": "Protocol 2 - Matching Element"
59
+ }},
60
+ ...
61
+ ]
62
+ ### **Example Output:**
63
+ [
64
+ {{
65
+ "Derived Description": "Pain Coping Strategies",
66
+ "Protocol_1": "Pain Coping Strategy Scale (PCSS-9)",
67
+ "Protocol_2": "Chronic Pain Adjustment Index (CPAI-10)"
68
+ }},
69
+ {{
70
+ "Derived Description": "Work Stress and Fatigue",
71
+ "Protocol_1": "Work-Related Stress Scale (WRSS-8)",
72
+ "Protocol_2": "Occupational Fatigue Index (OFI-7)"
73
+ }},
74
+ ...
75
+ ]
76
+
77
+ ### Rules:
78
+ 1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.
79
+ 2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.
80
+ 3. If no matching element is found in a protocol, leave it empty ("").
81
+ 4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.
82
+ 5. It should include all the elements in the two protocols.
83
+ 6. If it cannot match the element, create the row and include the protocol it did find and put "could not match" in the other protocol column.
84
+ 7. Protocol_1 and Protocol_2 should each be taken from a different one of the two uploaded protocol files.
85
+ """
86
+
87
+ compare_export_prompt = ChatPromptTemplate.from_template(export_prompt)
88
+
89
+ QUERY_PROMPT = """
90
+ You are a helpful assistant. Use the available context to answer the question concisely and informatively.
91
+
92
+ CONTEXT:
93
+ {context}
94
+
95
+ QUERY:
96
+ {question}
97
+
98
+ Provide a natural-language response using the given information. If you do not know the answer, say so.
99
+ """
100
+
101
+ query_prompt = ChatPromptTemplate.from_template(QUERY_PROMPT)
102
+
103
+
104
+ @tool
105
+ def document_query_tool(question: str) -> str:
106
+ """Retrieves relevant document sections and answers questions based on the uploaded documents."""
107
+
108
+ retriever = cl.user_session.get("qdrant_retriever")
109
+ if not retriever:
110
+ return "Error: No documents available for retrieval. Please upload documents first."
111
+
112
+ # Retrieve context from the vector database
113
+ retrieved_docs = retriever.invoke(question)
114
+ docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
115
+
116
+ # Generate response using the natural query prompt
117
+ messages = query_prompt.format_messages(question=question, context=docs_content)
118
+ response = llm.invoke(messages)
119
+
120
+ return {
121
+ "messages": [HumanMessage(content=response.content)],
122
+ "context": retrieved_docs
123
+ }
124
+
125
+
126
+
127
+ @tool
128
+ def document_comparison_tool(question: str) -> str:
129
+ """Compares the two uploaded documents, identifies matched elements, exports them as JSON, formats into CSV, and provides a download link."""
130
+
131
+ # Retrieve the vector database retriever
132
+ retriever = cl.user_session.get("qdrant_retriever")
133
+ if not retriever:
134
+ return "Error: No documents available for retrieval. Please upload two PDF files first."
135
+
136
+ # Process query using RAG
137
+ rag_chain = (
138
+ {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
139
+ | compare_export_prompt | llm | StrOutputParser()
140
+ )
141
+ response_text = rag_chain.invoke({"question": question})
142
+
143
+ # Parse response and save as CSV
144
+ try:
145
+ structured_data = json.loads(response_text)
146
+ if not structured_data:
147
+ return "Error: No matched elements found."
148
+
149
+ # Define output file path
150
+ file_path = os.path.join(OUTPUT_PATH, "comparison_results.csv")
151
+
152
+ # Save to CSV
153
+ df = pd.DataFrame(structured_data, columns=["Derived Description", "Protocol_1", "Protocol_2"])
154
+ df.to_csv(file_path, index=False)
155
+
156
+ return file_path # Return path to the CSV file
157
+
158
+ except json.JSONDecodeError:
159
+ return "Error: Response is not valid JSON."
160
+
161
+
162
+
163
+ tool_belt = [document_query_tool, document_comparison_tool]
164
+ model = ChatOpenAI(model="gpt-4o", temperature=0)
165
+ model = model.bind_tools(tool_belt)
166
+
167
+ async def process_files(files: list[cl.File]):
168
+ documents_with_metadata = []
169
+ for file in files:
170
+ file_path = os.path.join(UPLOAD_PATH, file.name)
171
+ shutil.copyfile(file.path, file_path)
172
+
173
+ loader = PyMuPDFLoader(file_path)
174
+ documents = loader.load()
175
+
176
+ for doc in documents:
177
+ source_name = file.name
178
+ chunks = semantic_splitter.split_text(doc.page_content)
179
+ for chunk in chunks:
180
+ doc_chunk = Document(page_content=chunk, metadata={"source": source_name})
181
+ documents_with_metadata.append(doc_chunk)
182
+
183
+ if documents_with_metadata:
184
+ qdrant_vectorstore = Qdrant.from_documents(
185
+ documents_with_metadata,
186
+ embedding_model,
187
+ location=":memory:",
188
+ collection_name="document_comparison",
189
+ )
190
+ return qdrant_vectorstore.as_retriever()
191
+ return None
192
+
193
+ @cl.on_chat_start
194
+ async def start():
195
+ cl.user_session.set("qdrant_retriever", None)
196
+ files = await cl.AskFileMessage(
197
+ content="Please upload **two PDF files** for comparison:",
198
+ accept=["application/pdf"],
199
+ max_files=2
200
+ ).send()
201
+
202
+ if len(files) != 2:
203
+ await cl.Message("Error: You must upload exactly two PDF files.").send()
204
+ return
205
+
206
+ retriever = await process_files(files)
207
+ if retriever:
208
+ cl.user_session.set("qdrant_retriever", retriever)
209
+ await cl.Message("Files uploaded and processed successfully! You can now enter your query.").send()
210
+ else:
211
+ await cl.Message("Error: Unable to process files. Please try again.").send()
212
+
213
+ @cl.on_message
214
+ async def handle_message(message: cl.Message):
215
+ user_input = message.content.lower()
216
+
217
+ # If the user asks for a comparison, run the document_comparison_tool
218
+ if "compare" in user_input or "export" in user_input:
219
+ file_path = document_comparison_tool.invoke(user_input)
220
+ if file_path and file_path.endswith(".csv"):
221
+ await cl.Message(
222
+ content="Comparison complete! Download the CSV below:",
223
+ elements=[cl.File(name="comparison_results.csv", path=file_path, display="inline")],
224
+ ).send()
225
+ else:
226
+ await cl.Message(file_path).send()
227
+ else:
228
+ response_text = document_query_tool.invoke(user_input)
229
+ await cl.Message(response_text["messages"][0].content).send()
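The comparison tool in app.py relies on the LLM returning raw JSON that can be loaded and written straight to CSV. Below is a minimal sketch of that parsing step in isolation; the sample response string is hypothetical (made up for illustration, not real model output), but it follows the format requested by export_prompt, so it can be used to sanity-check the column handling without running the full Chainlit app:

```python
import json
import pandas as pd

# Hypothetical model output in the format requested by export_prompt.
sample_response = """[
  {"Derived Description": "Pain Coping Strategies",
   "Protocol_1": "Pain Coping Strategy Scale (PCSS-9)",
   "Protocol_2": "Chronic Pain Adjustment Index (CPAI-10)"},
  {"Derived Description": "Work Stress and Fatigue",
   "Protocol_1": "Work-Related Stress Scale (WRSS-8)",
   "Protocol_2": "could not match"}
]"""

try:
    structured_data = json.loads(sample_response)
    # Same column order the app writes to comparison_results.csv.
    df = pd.DataFrame(structured_data, columns=["Derived Description", "Protocol_1", "Protocol_2"])
    df.to_csv("comparison_results_sample.csv", index=False)
    print(df)
except json.JSONDecodeError:
    print("Error: Response is not valid JSON.")
```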
chainlit.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Welcome to Chat with Your Text File
2
+ With this application, you can compare two uploaded PDF documents.
example_files/florida_protocol.pdf ADDED
Binary file (3.97 kB). View file
 
example_files/matching_data_elements.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Derived Description,Protocol_1,Protocol_2
2
+ Pain Coping Strategies,Pain Coping Strategy Scale (PCSS-9),Pain Management Techniques
3
+ Work Stress Assessment,Work-Related Stress Scale (WRSS-8),Occupational Fatigue Index (OFI-7)
4
+ Decision-Making Confidence,Decision-Making Confidence Scale (DMCS-6),Rational Decision-Making Test (RDMT-6)
5
+ Cognitive Task Management,Cognitive Load and Task Management,Cognitive and Emotional Resilience
6
+ Emotional Resilience and Regulation,Emotional Resilience Score (ERS-9),Emotional Regulation Index (ERI-9)
7
+ Social Engagement and Communication,Public Speaking and Social Engagement (PSSE-6),could not match
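The CSV above is the reference output the comparison tool is expected to produce for the bundled example protocols. A quick sketch, assuming it is read from the repository root, to confirm the expected column layout:

```python
import pandas as pd

# Path relative to the repository root.
example = pd.read_csv("example_files/matching_data_elements.csv")
print(example.columns.tolist())  # ['Derived Description', 'Protocol_1', 'Protocol_2']
print(example.to_string(index=False))
```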
example_files/wyoming_protocol.pdf ADDED
Binary file (4.36 kB). View file
 
pyproject.toml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "midterm_poc"
3
+ version = "0.1.0"
4
+ description = "midterm POC huggingface project"
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ dependencies = [
8
+ "chainlit",
9
+ "langchain",
10
+ "langchain_community",
11
+ "tqdm",
12
+ "PyMuPDF",
13
+ "openai>=1.59.9",
14
+ "pypdf2>=3.0.1",
15
+ "websockets",
16
+ "qdrant-client",
17
+ "langchain",
18
+ "langchain-community",
19
+ "langchain-openai",
20
+ "unstructured",
21
+ "pymupdf",
22
+ "qdrant-client",
23
+ "langgraph",
24
+ "langchain-core",
25
+ "langchain-openai",
26
+ "langchain-community",
27
+ "ragas",
28
+ "langchain_experimental",
29
+ ###review
30
+ ### cleanup
31
+ "langchain-core==0.3.31",
32
+ "langchain==0.3.15",
33
+ "langchain-community==0.3.15",
34
+ "langchain-openai==0.3.1",
35
+ "langchain-qdrant==0.2.0",
36
+ "langchain-text-splitters>=0.3.5",
37
+ "langchain-huggingface==0.1.2",
38
+ #"langgraph>=0.2.67",
39
+ "langsmith>=0.3.1",
40
+ "lxml>=5.3.0",
41
+ ###notebook
42
+ "ipykernel",
43
+ "ipywidgets",
44
+ "IProgress",
45
+ "huggingface_hub",
46
+ "wandb",
47
+ "transformers",
48
+ "accelerate",
49
+ "torch",
50
+ #### ragas
51
+ #"ragas==0.2.10"
52
+ #"FAISS"
53
+ #remove only used for testing
54
+ "cohere",
55
+ "langchain_cohere",
56
+ "arxiv"
57
+ ]
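Because the dependency list mixes unpinned names with exact pins, it can be useful to confirm which versions actually resolved after `uv sync`. A minimal sketch using only the standard library; the pins below are copied from the pyproject.toml above:

```python
from importlib.metadata import version, PackageNotFoundError

# Exact pins taken from pyproject.toml.
pinned = {
    "langchain-core": "0.3.31",
    "langchain": "0.3.15",
    "langchain-community": "0.3.15",
    "langchain-openai": "0.3.1",
    "langchain-qdrant": "0.2.0",
    "langchain-huggingface": "0.1.2",
}

for package, expected in pinned.items():
    try:
        installed = version(package)
        status = "OK" if installed == expected else f"MISMATCH (installed {installed})"
    except PackageNotFoundError:
        status = "NOT INSTALLED"
    print(f"{package}=={expected}: {status}")
```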
uv.lock ADDED
The diff for this file is too large to render. See raw diff