drewgenai committed on
Commit
c25f92a
·
1 Parent(s): d35bb79
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
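Editor's note: the entries above route matching files through Git LFS (filter=lfs diff=lfs merge=lfs) and mark them as non-text so Git stores pointers instead of the binaries. As a rough illustration only, the Python sketch below approximates that routing with shell-style globs; the pattern subset and the basename-only fnmatch semantics are assumptions, since Git's own attribute matching (for example saved_model/**/*) differs in detail.

import fnmatch

# Subset of the LFS patterns from the .gitattributes entries above
# (assumption: basename matching only, not full gitattributes semantics).
lfs_patterns = ["*.bin", "*.h5", "*.parquet", "*.pt", "*.safetensors", "*tfevents*"]

def routed_to_lfs(path: str) -> bool:
    """Return True if the file name would match one of the LFS glob patterns."""
    name = path.rsplit("/", 1)[-1]
    return any(fnmatch.fnmatch(name, pattern) for pattern in lfs_patterns)

print(routed_to_lfs("models/model.safetensors"))  # True
print(routed_to_lfs("README.md"))                 # False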
.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ .chainlit/
3
+ .venv/
4
+ .env
5
+ /output/
6
+ /upload/
7
+ *.jsonl
8
+ /models/
9
+ *z*.py
01-cleanragcsv.ipynb ADDED
@@ -0,0 +1,686 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Requirement already satisfied: nest_asyncio in ./.venv/lib/python3.13/site-packages (1.6.0)\n",
13
+ "Requirement already satisfied: langchain_openai in ./.venv/lib/python3.13/site-packages (0.3.6)\n",
14
+ "Requirement already satisfied: langchain_huggingface in ./.venv/lib/python3.13/site-packages (0.1.2)\n",
15
+ "Requirement already satisfied: langchain_core in ./.venv/lib/python3.13/site-packages (0.3.37)\n",
16
+ "Requirement already satisfied: langchain in ./.venv/lib/python3.13/site-packages (0.3.19)\n",
17
+ "Requirement already satisfied: langchain_community in ./.venv/lib/python3.13/site-packages (0.3.18)\n",
18
+ "Requirement already satisfied: langchain-text-splitters in ./.venv/lib/python3.13/site-packages (0.3.6)\n",
19
+ "Requirement already satisfied: faiss-cpu in ./.venv/lib/python3.13/site-packages (1.10.0)\n",
20
+ "Requirement already satisfied: python-pptx==1.0.2 in ./.venv/lib/python3.13/site-packages (1.0.2)\n",
21
+ "Requirement already satisfied: nltk==3.9.1 in ./.venv/lib/python3.13/site-packages (3.9.1)\n",
22
+ "Requirement already satisfied: pymupdf in ./.venv/lib/python3.13/site-packages (1.25.3)\n",
23
+ "Requirement already satisfied: beautifulsoup4 in ./.venv/lib/python3.13/site-packages (4.13.3)\n",
24
+ "Requirement already satisfied: lxml in ./.venv/lib/python3.13/site-packages (5.3.1)\n",
25
+ "Requirement already satisfied: sentence-transformers in ./.venv/lib/python3.13/site-packages (3.4.1)\n",
26
+ "Requirement already satisfied: IProgress in ./.venv/lib/python3.13/site-packages (0.4)\n",
27
+ "Requirement already satisfied: huggingface_hub in ./.venv/lib/python3.13/site-packages (0.29.1)\n",
28
+ "Requirement already satisfied: ipywidgets in ./.venv/lib/python3.13/site-packages (8.1.5)\n",
29
+ "Requirement already satisfied: qdrant-client in ./.venv/lib/python3.13/site-packages (1.13.2)\n",
30
+ "Requirement already satisfied: Pillow>=3.3.2 in ./.venv/lib/python3.13/site-packages (from python-pptx==1.0.2) (11.1.0)\n",
31
+ "Requirement already satisfied: XlsxWriter>=0.5.7 in ./.venv/lib/python3.13/site-packages (from python-pptx==1.0.2) (3.2.2)\n",
32
+ "Requirement already satisfied: typing-extensions>=4.9.0 in ./.venv/lib/python3.13/site-packages (from python-pptx==1.0.2) (4.12.2)\n",
33
+ "Requirement already satisfied: click in ./.venv/lib/python3.13/site-packages (from nltk==3.9.1) (8.1.8)\n",
34
+ "Requirement already satisfied: joblib in ./.venv/lib/python3.13/site-packages (from nltk==3.9.1) (1.4.2)\n",
35
+ "Requirement already satisfied: regex>=2021.8.3 in ./.venv/lib/python3.13/site-packages (from nltk==3.9.1) (2024.11.6)\n",
36
+ "Requirement already satisfied: tqdm in ./.venv/lib/python3.13/site-packages (from nltk==3.9.1) (4.67.1)\n",
37
+ "Requirement already satisfied: openai<2.0.0,>=1.58.1 in ./.venv/lib/python3.13/site-packages (from langchain_openai) (1.63.2)\n",
38
+ "Requirement already satisfied: tiktoken<1,>=0.7 in ./.venv/lib/python3.13/site-packages (from langchain_openai) (0.9.0)\n",
39
+ "Requirement already satisfied: tokenizers>=0.19.1 in ./.venv/lib/python3.13/site-packages (from langchain_huggingface) (0.21.0)\n",
40
+ "Requirement already satisfied: transformers>=4.39.0 in ./.venv/lib/python3.13/site-packages (from langchain_huggingface) (4.49.0)\n",
41
+ "Requirement already satisfied: langsmith<0.4,>=0.1.125 in ./.venv/lib/python3.13/site-packages (from langchain_core) (0.3.10)\n",
42
+ "Requirement already satisfied: tenacity!=8.4.0,<10.0.0,>=8.1.0 in ./.venv/lib/python3.13/site-packages (from langchain_core) (9.0.0)\n",
43
+ "Requirement already satisfied: jsonpatch<2.0,>=1.33 in ./.venv/lib/python3.13/site-packages (from langchain_core) (1.33)\n",
44
+ "Requirement already satisfied: PyYAML>=5.3 in ./.venv/lib/python3.13/site-packages (from langchain_core) (6.0.2)\n",
45
+ "Requirement already satisfied: packaging<25,>=23.2 in ./.venv/lib/python3.13/site-packages (from langchain_core) (24.2)\n",
46
+ "Requirement already satisfied: pydantic<3.0.0,>=2.7.4 in ./.venv/lib/python3.13/site-packages (from langchain_core) (2.10.6)\n",
47
+ "Requirement already satisfied: SQLAlchemy<3,>=1.4 in ./.venv/lib/python3.13/site-packages (from langchain) (2.0.38)\n",
48
+ "Requirement already satisfied: requests<3,>=2 in ./.venv/lib/python3.13/site-packages (from langchain) (2.32.3)\n",
49
+ "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in ./.venv/lib/python3.13/site-packages (from langchain) (3.11.12)\n",
50
+ "Requirement already satisfied: numpy<3,>=1.26.2 in ./.venv/lib/python3.13/site-packages (from langchain) (2.2.3)\n",
51
+ "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in ./.venv/lib/python3.13/site-packages (from langchain_community) (0.6.7)\n",
52
+ "Requirement already satisfied: pydantic-settings<3.0.0,>=2.4.0 in ./.venv/lib/python3.13/site-packages (from langchain_community) (2.8.0)\n",
53
+ "Requirement already satisfied: httpx-sse<1.0.0,>=0.4.0 in ./.venv/lib/python3.13/site-packages (from langchain_community) (0.4.0)\n",
54
+ "Requirement already satisfied: soupsieve>1.2 in ./.venv/lib/python3.13/site-packages (from beautifulsoup4) (2.6)\n",
55
+ "Requirement already satisfied: torch>=1.11.0 in ./.venv/lib/python3.13/site-packages (from sentence-transformers) (2.6.0)\n",
56
+ "Requirement already satisfied: scikit-learn in ./.venv/lib/python3.13/site-packages (from sentence-transformers) (1.6.1)\n",
57
+ "Requirement already satisfied: scipy in ./.venv/lib/python3.13/site-packages (from sentence-transformers) (1.15.2)\n",
58
+ "Requirement already satisfied: six in ./.venv/lib/python3.13/site-packages (from IProgress) (1.17.0)\n",
59
+ "Requirement already satisfied: filelock in ./.venv/lib/python3.13/site-packages (from huggingface_hub) (3.17.0)\n",
60
+ "Requirement already satisfied: fsspec>=2023.5.0 in ./.venv/lib/python3.13/site-packages (from huggingface_hub) (2024.12.0)\n",
61
+ "Requirement already satisfied: comm>=0.1.3 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (0.2.2)\n",
62
+ "Requirement already satisfied: ipython>=6.1.0 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (8.32.0)\n",
63
+ "Requirement already satisfied: traitlets>=4.3.1 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (5.14.3)\n",
64
+ "Requirement already satisfied: widgetsnbextension~=4.0.12 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (4.0.13)\n",
65
+ "Requirement already satisfied: jupyterlab-widgets~=3.0.12 in ./.venv/lib/python3.13/site-packages (from ipywidgets) (3.0.13)\n",
66
+ "Requirement already satisfied: grpcio>=1.41.0 in ./.venv/lib/python3.13/site-packages (from qdrant-client) (1.70.0)\n",
67
+ "Requirement already satisfied: grpcio-tools>=1.41.0 in ./.venv/lib/python3.13/site-packages (from qdrant-client) (1.70.0)\n",
68
+ "Requirement already satisfied: httpx>=0.20.0 in ./.venv/lib/python3.13/site-packages (from httpx[http2]>=0.20.0->qdrant-client) (0.28.1)\n",
69
+ "Requirement already satisfied: portalocker<3.0.0,>=2.7.0 in ./.venv/lib/python3.13/site-packages (from qdrant-client) (2.10.1)\n",
70
+ "Requirement already satisfied: urllib3<3,>=1.26.14 in ./.venv/lib/python3.13/site-packages (from qdrant-client) (2.3.0)\n",
71
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (2.4.6)\n",
72
+ "Requirement already satisfied: aiosignal>=1.1.2 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.2)\n",
73
+ "Requirement already satisfied: attrs>=17.3.0 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (25.1.0)\n",
74
+ "Requirement already satisfied: frozenlist>=1.1.1 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.5.0)\n",
75
+ "Requirement already satisfied: multidict<7.0,>=4.5 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.1.0)\n",
76
+ "Requirement already satisfied: propcache>=0.2.0 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (0.3.0)\n",
77
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in ./.venv/lib/python3.13/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.18.3)\n",
78
+ "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in ./.venv/lib/python3.13/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community) (3.26.1)\n",
79
+ "Requirement already satisfied: typing-inspect<1,>=0.4.0 in ./.venv/lib/python3.13/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain_community) (0.9.0)\n",
80
+ "Requirement already satisfied: protobuf<6.0dev,>=5.26.1 in ./.venv/lib/python3.13/site-packages (from grpcio-tools>=1.41.0->qdrant-client) (5.29.3)\n",
81
+ "Requirement already satisfied: setuptools in ./.venv/lib/python3.13/site-packages (from grpcio-tools>=1.41.0->qdrant-client) (75.8.0)\n",
82
+ "Requirement already satisfied: anyio in ./.venv/lib/python3.13/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (4.8.0)\n",
83
+ "Requirement already satisfied: certifi in ./.venv/lib/python3.13/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (2025.1.31)\n",
84
+ "Requirement already satisfied: httpcore==1.* in ./.venv/lib/python3.13/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (1.0.7)\n",
85
+ "Requirement already satisfied: idna in ./.venv/lib/python3.13/site-packages (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (3.10)\n",
86
+ "Requirement already satisfied: h11<0.15,>=0.13 in ./.venv/lib/python3.13/site-packages (from httpcore==1.*->httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant-client) (0.14.0)\n",
87
+ "Requirement already satisfied: h2<5,>=3 in ./.venv/lib/python3.13/site-packages (from httpx[http2]>=0.20.0->qdrant-client) (4.2.0)\n",
88
+ "Requirement already satisfied: decorator in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (5.2.1)\n",
89
+ "Requirement already satisfied: jedi>=0.16 in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (0.19.2)\n",
90
+ "Requirement already satisfied: matplotlib-inline in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.7)\n",
91
+ "Requirement already satisfied: pexpect>4.3 in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (4.9.0)\n",
92
+ "Requirement already satisfied: prompt_toolkit<3.1.0,>=3.0.41 in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.50)\n",
93
+ "Requirement already satisfied: pygments>=2.4.0 in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (2.19.1)\n",
94
+ "Requirement already satisfied: stack_data in ./.venv/lib/python3.13/site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)\n",
95
+ "Requirement already satisfied: jsonpointer>=1.9 in ./.venv/lib/python3.13/site-packages (from jsonpatch<2.0,>=1.33->langchain_core) (3.0.0)\n",
96
+ "Requirement already satisfied: orjson<4.0.0,>=3.9.14 in ./.venv/lib/python3.13/site-packages (from langsmith<0.4,>=0.1.125->langchain_core) (3.10.15)\n",
97
+ "Requirement already satisfied: requests-toolbelt<2.0.0,>=1.0.0 in ./.venv/lib/python3.13/site-packages (from langsmith<0.4,>=0.1.125->langchain_core) (1.0.0)\n",
98
+ "Requirement already satisfied: zstandard<0.24.0,>=0.23.0 in ./.venv/lib/python3.13/site-packages (from langsmith<0.4,>=0.1.125->langchain_core) (0.23.0)\n",
99
+ "Requirement already satisfied: distro<2,>=1.7.0 in ./.venv/lib/python3.13/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.9.0)\n",
100
+ "Requirement already satisfied: jiter<1,>=0.4.0 in ./.venv/lib/python3.13/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (0.8.2)\n",
101
+ "Requirement already satisfied: sniffio in ./.venv/lib/python3.13/site-packages (from openai<2.0.0,>=1.58.1->langchain_openai) (1.3.1)\n",
102
+ "Requirement already satisfied: annotated-types>=0.6.0 in ./.venv/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.7.4->langchain_core) (0.7.0)\n",
103
+ "Requirement already satisfied: pydantic-core==2.27.2 in ./.venv/lib/python3.13/site-packages (from pydantic<3.0.0,>=2.7.4->langchain_core) (2.27.2)\n",
104
+ "Requirement already satisfied: python-dotenv>=0.21.0 in ./.venv/lib/python3.13/site-packages (from pydantic-settings<3.0.0,>=2.4.0->langchain_community) (1.0.1)\n",
105
+ "Requirement already satisfied: charset-normalizer<4,>=2 in ./.venv/lib/python3.13/site-packages (from requests<3,>=2->langchain) (3.4.1)\n",
106
+ "Requirement already satisfied: greenlet!=0.4.17 in ./.venv/lib/python3.13/site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.1.1)\n",
107
+ "Requirement already satisfied: networkx in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (3.4.2)\n",
108
+ "Requirement already satisfied: jinja2 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.5)\n",
109
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
110
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
111
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
112
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (9.1.0.70)\n",
113
+ "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.5.8)\n",
114
+ "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (11.2.1.3)\n",
115
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (10.3.5.147)\n",
116
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (11.6.1.9)\n",
117
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.3.1.170)\n",
118
+ "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (0.6.2)\n",
119
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (2.21.5)\n",
120
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
121
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (12.4.127)\n",
122
+ "Requirement already satisfied: triton==3.2.0 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (3.2.0)\n",
123
+ "Requirement already satisfied: sympy==1.13.1 in ./.venv/lib/python3.13/site-packages (from torch>=1.11.0->sentence-transformers) (1.13.1)\n",
124
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in ./.venv/lib/python3.13/site-packages (from sympy==1.13.1->torch>=1.11.0->sentence-transformers) (1.3.0)\n",
125
+ "Requirement already satisfied: safetensors>=0.4.1 in ./.venv/lib/python3.13/site-packages (from transformers>=4.39.0->langchain_huggingface) (0.5.2)\n",
126
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in ./.venv/lib/python3.13/site-packages (from scikit-learn->sentence-transformers) (3.5.0)\n",
127
+ "Requirement already satisfied: hyperframe<7,>=6.1 in ./.venv/lib/python3.13/site-packages (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client) (6.1.0)\n",
128
+ "Requirement already satisfied: hpack<5,>=4.1 in ./.venv/lib/python3.13/site-packages (from h2<5,>=3->httpx[http2]>=0.20.0->qdrant-client) (4.1.0)\n",
129
+ "Requirement already satisfied: parso<0.9.0,>=0.8.4 in ./.venv/lib/python3.13/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.4)\n",
130
+ "Requirement already satisfied: ptyprocess>=0.5 in ./.venv/lib/python3.13/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)\n",
131
+ "Requirement already satisfied: wcwidth in ./.venv/lib/python3.13/site-packages (from prompt_toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)\n",
132
+ "Requirement already satisfied: mypy-extensions>=0.3.0 in ./.venv/lib/python3.13/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community) (1.0.0)\n",
133
+ "Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib/python3.13/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (3.0.2)\n",
134
+ "Requirement already satisfied: executing>=1.2.0 in ./.venv/lib/python3.13/site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (2.2.0)\n",
135
+ "Requirement already satisfied: asttokens>=2.1.0 in ./.venv/lib/python3.13/site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (3.0.0)\n",
136
+ "Requirement already satisfied: pure-eval in ./.venv/lib/python3.13/site-packages (from stack_data->ipython>=6.1.0->ipywidgets) (0.2.3)\n"
137
+ ]
138
+ }
139
+ ],
140
+ "source": [
141
+ "# !pip install nest_asyncio \\\n",
142
+ "# langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters \\\n",
143
+ "# python-pptx==1.0.2 nltk==3.9.1 pymupdf lxml \\\n",
144
+ "# sentence-transformers IProgress \\\n",
145
+ "# huggingface_hub ipywidgets \\\n",
146
+ "# qdrant-client"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 1,
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "\n",
156
+ "import nest_asyncio\n",
157
+ "\n",
158
+ "nest_asyncio.apply()"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 2,
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "import os\n",
168
+ "import getpass\n",
169
+ "\n",
170
+ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key: \")"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 3,
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": [
179
+ "hf_username = getpass.getpass(\"Enter Your Hugging Face Username: \")\n"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 4,
185
+ "metadata": {},
186
+ "outputs": [
187
+ {
188
+ "data": {
189
+ "application/vnd.jupyter.widget-view+json": {
190
+ "model_id": "a5c203d394cb4c1d933c1af73ff1c112",
191
+ "version_major": 2,
192
+ "version_minor": 0
193
+ },
194
+ "text/plain": [
195
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
196
+ ]
197
+ },
198
+ "metadata": {},
199
+ "output_type": "display_data"
200
+ }
201
+ ],
202
+ "source": [
203
+ "from huggingface_hub import notebook_login\n",
204
+ "notebook_login()"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 5,
210
+ "metadata": {},
211
+ "outputs": [
212
+ {
213
+ "name": "stdout",
214
+ "output_type": "stream",
215
+ "text": [
216
+ "{'type': 'user', 'id': '67624d1b57e77fe6e0c87ae5', 'name': 'drewgenai', 'fullname': 'Drew DeMarco', 'email': '[email protected]', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/L6eLaZmCK4jqW3ZTLYIAR.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'newotken', 'role': 'write', 'createdAt': '2025-02-12T04:11:04.130Z'}}}\n"
217
+ ]
218
+ }
219
+ ],
220
+ "source": [
221
+ "from huggingface_hub import whoami\n",
222
+ "print(whoami())\n"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 6,
228
+ "metadata": {},
229
+ "outputs": [
230
+ {
231
+ "name": "stdout",
232
+ "output_type": "stream",
233
+ "text": [
234
+ "mkdir: cannot create directory ‘example_files’: File exists\n",
235
+ "mkdir: cannot create directory ‘output’: File exists\n"
236
+ ]
237
+ }
238
+ ],
239
+ "source": [
240
+ "!mkdir example_files\n",
241
+ "!mkdir output"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 7,
247
+ "metadata": {},
248
+ "outputs": [],
249
+ "source": [
250
+ "from langchain_community.document_loaders import DirectoryLoader\n",
251
+ "from langchain_community.document_loaders import PyMuPDFLoader\n",
252
+ "\n",
253
+ "path = \"example_files/\"\n",
254
+ "text_loader = DirectoryLoader(path, glob=\"*.pdf\", loader_cls=PyMuPDFLoader)"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "markdown",
259
+ "metadata": {},
260
+ "source": [
261
+ "1️⃣ Header-Based Chunking (Title-Based Splitter)\n",
262
+ "Uses document structure to split on headings, section titles, or patterns.\n",
263
+ "Works well for structured documents with named assessments, numbered lists, or headers.\n",
264
+ "Example: If it detects Chronic Pain Adjustment Index (CPAI-10), it groups everything under that title.\n",
265
+ "2️⃣ Semantic Chunking (Text-Meaning Splitter)\n",
266
+ "Uses embeddings or sentence similarity to decide where to break chunks.\n",
267
+ "Prevents splitting mid-context if sentences are closely related.\n",
268
+ "Example: Groups all related pain-assessment questions into one chunk."
269
+ ]
270
+ },
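Editor's note: the cell above describes two chunking strategies, but the notebook only implements the semantic variant (SemanticChunker, further down). A minimal, hypothetical sketch of the header-based approach follows; the heading regex and the assumed title format such as "Chronic Pain Adjustment Index (CPAI-10)" are assumptions about the example PDFs, not code from this commit.

import re

# Assumed heading shape: a title ending in an abbreviation like (CPAI-10), alone on its line.
HEADING_RE = re.compile(r"^(?P<title>[A-Z][\w\s-]+\([A-Z]{2,}-\d+\))\s*$", re.MULTILINE)

def split_by_headings(text: str):
    """Split raw page text into (title, body) chunks, one per detected heading."""
    matches = list(HEADING_RE.finditer(text))
    chunks = []
    for i, match in enumerate(matches):
        start = match.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chunks.append((match.group("title"), text[start:end].strip()))
    return chunks

Each chunk keeps its section title, so it could be wrapped in a Document with the title in metadata, much as the semantic chunks are handled in the cells below.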
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 8,
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "# from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
278
+ "\n",
279
+ "\n",
280
+ "# text_splitter = RecursiveCharacterTextSplitter(\n",
281
+ "# chunk_size = 200,\n",
282
+ "# chunk_overlap = 20,\n",
283
+ "# length_function = len\n",
284
+ "# )\n",
285
+ "\n",
286
+ "\n",
287
+ "### potentially use for lenth tokens later"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": null,
293
+ "metadata": {},
294
+ "outputs": [],
295
+ "source": []
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 9,
300
+ "metadata": {},
301
+ "outputs": [],
302
+ "source": [
303
+ "# #Load documents with metadata\n",
304
+ "# all_documents = text_loader.load()\n",
305
+ "# documents_with_metadata = []"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "code",
310
+ "execution_count": 10,
311
+ "metadata": {},
312
+ "outputs": [],
313
+ "source": [
314
+ "# for doc in all_documents:\n",
315
+ "# # Extract document name (assuming PyMuPDFLoader stores the file name in metadata)\n",
316
+ "# source_name = doc.metadata.get(\"source\", \"unknown\")\n",
317
+ " \n",
318
+ "# # Split into chunks while preserving metadata\n",
319
+ "# chunks = text_splitter.split_documents([doc])\n",
320
+ "# for chunk in chunks:\n",
321
+ "# chunk.metadata[\"source\"] = source_name # Attach source info to each chunk\n",
322
+ "# documents_with_metadata.extend(chunks)"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "markdown",
327
+ "metadata": {},
328
+ "source": [
329
+ "###testingbelow\n"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": 11,
335
+ "metadata": {},
336
+ "outputs": [],
337
+ "source": [
338
+ "#!pip install langchain_experimental"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 12,
344
+ "metadata": {},
345
+ "outputs": [
346
+ {
347
+ "name": "stderr",
348
+ "output_type": "stream",
349
+ "text": [
350
+ "/tmp/ipykernel_456462/1110142159.py:7: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
351
+ " embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n"
352
+ ]
353
+ }
354
+ ],
355
+ "source": [
356
+ "from langchain_experimental.text_splitter import SemanticChunker\n",
357
+ "\n",
358
+ "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
359
+ "\n",
360
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
361
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
362
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
363
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\n",
364
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id, model_kwargs={\"trust_remote_code\": True})\n",
365
+ "\n",
366
+ "\n",
367
+ "semantic_splitter = SemanticChunker(embedding_model)\n",
368
+ "\n",
369
+ "all_documents = text_loader.load()\n",
370
+ "documents_with_metadata = []\n",
371
+ "\n"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 13,
377
+ "metadata": {},
378
+ "outputs": [],
379
+ "source": [
380
+ "#verify working\n",
381
+ "# test_doc = all_documents[0].page_content if all_documents else \"\"\n",
382
+ "# test_chunks = semantic_splitter.split_text(test_doc)\n",
383
+ "\n",
384
+ "# print(f\"\\n✅ Total Chunks for First Document: {len(test_chunks)}\")\n",
385
+ "# for i, chunk in enumerate(test_chunks[:3]): # Show first 3 chunks\n",
386
+ "# print(f\"\\n🔹 Chunk {i+1}: {chunk[:300]}\") # Print first 300 characters\n"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": 14,
392
+ "metadata": {},
393
+ "outputs": [],
394
+ "source": [
395
+ "from langchain.schema import Document\n",
396
+ "\n",
397
+ "for doc in all_documents:\n",
398
+ " source_name = doc.metadata.get(\"source\", \"unknown\") # Get document source\n",
399
+ "\n",
400
+ " # Use SemanticChunker to intelligently split text\n",
401
+ " chunks = semantic_splitter.split_text(doc.page_content)\n",
402
+ "\n",
403
+ " # Convert chunks into LangChain Document format with metadata\n",
404
+ " for chunk in chunks:\n",
405
+ " doc_chunk = Document(page_content=chunk, metadata={\"source\": source_name})\n",
406
+ " documents_with_metadata.append(doc_chunk)"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "markdown",
411
+ "metadata": {},
412
+ "source": [
413
+ "###testingabove"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": 15,
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "\n",
423
+ "#!pip install -qU huggingface_hub\n",
424
+ "#!pip install -qU ipywidgets\n"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": 16,
430
+ "metadata": {},
431
+ "outputs": [],
432
+ "source": [
433
+ "from sentence_transformers import SentenceTransformer\n",
434
+ "from langchain.vectorstores import Qdrant\n",
435
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
436
+ "\n",
437
+ "\n",
438
+ "# Load the SentenceTransformer model\n",
439
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
440
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
441
+ "\n",
442
+ "# Load documents into Qdrant\n",
443
+ "qdrant_vectorstore = Qdrant.from_documents(\n",
444
+ " documents_with_metadata,\n",
445
+ " embedding_model,\n",
446
+ " location=\":memory:\", # In-memory for testing\n",
447
+ " collection_name=\"document_comparison\",\n",
448
+ ")\n",
449
+ "\n",
450
+ "# Create a retriever\n",
451
+ "qdrant_retriever = qdrant_vectorstore.as_retriever()"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 63,
457
+ "metadata": {},
458
+ "outputs": [],
459
+ "source": [
460
+ "from langchain_core.prompts import ChatPromptTemplate\n",
461
+ "RAG_PROMPT = \"\"\"\n",
462
+ "CONTEXT:\n",
463
+ "{context}\n",
464
+ "\n",
465
+ "QUERY:\n",
466
+ "{question}\n",
467
+ "\n",
468
+ "You are a helpful assistant. Use the available context to answer the question.\n",
469
+ "\n",
470
+ "Return the response in **valid JSON format** with the following structure:\n",
471
+ "\n",
472
+ "[\n",
473
+ " {{\n",
474
+ " \"Derived Description\": \"A short name for the matched concept\",\n",
475
+ " \"Protocol_1\": \"Protocol 1 - Matching Element\",\n",
476
+ " \"Protocol_2\": \"Protocol 2 - Matching Element\"\n",
477
+ " }},\n",
478
+ " ...\n",
479
+ "]\n",
480
+ "\n",
481
+ "### Rules:\n",
482
+ "1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.\n",
483
+ "2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.\n",
484
+ "3. If no matching element is found in a protocol, leave it empty (\"\").\n",
485
+ "4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.\n",
486
+ "5. It should include all the elements in the two protocols.\n",
487
+ "6. If it cannot match the element, create the row and include the protocol it did find and put \"could not match\" in the other protocol column.\n",
488
+ "7. protocol should be the between\n",
489
+ "\"\"\"\n",
490
+ "\n",
491
+ "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)\n",
492
+ "\n",
493
+ "from langchain_openai import ChatOpenAI\n",
494
+ "\n",
495
+ "#openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
496
+ "openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
497
+ "\n",
498
+ "from operator import itemgetter\n",
499
+ "from langchain.schema.output_parser import StrOutputParser\n",
500
+ "\n",
501
+ "rag_chain = (\n",
502
+ " {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
503
+ " | rag_prompt | openai_chat_model | StrOutputParser()\n",
504
+ ")"
505
+ ]
506
+ },
507
+ {
508
+ "cell_type": "code",
509
+ "execution_count": 64,
510
+ "metadata": {},
511
+ "outputs": [],
512
+ "source": [
513
+ "question_text = \"\"\"You are a helpful assistant. Use the available context to answer the question.\n",
514
+ "\n",
515
+ "Between these two files containing protocols, identify and match **entire assessment sections** based on conceptual similarity. Do NOT match individual questions.\n",
516
+ "\n",
517
+ "### **Output Format:**\n",
518
+ "Return the response in **valid JSON format** structured as a list of dictionaries, where each dictionary contains:\n",
519
+ "\n",
520
+ "[\n",
521
+ " {\n",
522
+ " \"Derived Description\": \"A short name describing the matched sections\",\n",
523
+ " \"Protocol_1\": \"Exact section heading from Protocol 1\",\n",
524
+ " \"Protocol_2\": \"Exact section heading from Protocol 2\"\n",
525
+ " }\n",
526
+ "]\n",
527
+ "\n",
528
+ "### **Matching Criteria:**\n",
529
+ "1. **Match entire assessment sections** based on their purpose and overall topic.\n",
530
+ "3. If a section in one protocol **has no match**, include it but leave the other protocol's field blank.\n",
531
+ "4. The **\"Derived Description\"** should be a **concise label** summarizing the section’s purpose, . It should describe the overall concept of the matched sections.\n",
532
+ "\n",
533
+ "### **Rules:**\n",
534
+ "1. **Only output valid JSON**—no explanations, summaries, or markdown formatting.\n",
535
+ "2. **Ensure each entry represents a single section-to-section match.**\n",
536
+ "4. **Prioritize conceptual similarity over exact wording** when aligning sections.\n",
537
+ "5. If no match is found, leave the unmatched protocol entry blank.\n",
538
+ "\n",
539
+ "### **Example Output:**\n",
540
+ "[\n",
541
+ " {\n",
542
+ " \"Derived Description\": \"Pain Coping Strategies\",\n",
543
+ " \"Protocol_1\": \"Pain Coping Strategy Scale (PCSS-9)\",\n",
544
+ " \"Protocol_2\": \"Chronic Pain Adjustment Index (CPAI-10)\"\n",
545
+ " },\n",
546
+ " {\n",
547
+ " \"Derived Description\": \"Work Stress and Fatigue\",\n",
548
+ " \"Protocol_1\": \"Work-Related Stress Scale (WRSS-8)\",\n",
549
+ " \"Protocol_2\": \"Occupational Fatigue Index (OFI-7)\"\n",
550
+ " },\n",
551
+ "]\n",
552
+ "\n",
553
+ "Do not add any additional text, explanations, or formatting—**only return the raw JSON list**.\n",
554
+ "\"\"\"\n",
555
+ "\n",
556
+ "\n",
557
+ "\n",
558
+ "# The questions within elements will be similar between the two documents and can be used to match the elements.\n",
559
+ "\n",
560
+ "# 1. Derived description from the two documents describing the index/measure/scale.\n",
561
+ "# 2. A column for each standard.\n",
562
+ "# 3. In the column for each name/version, the data element used to capture that description that will be the shortened item between ()\n",
563
+ "\n",
564
+ "# There should only be one row for each scale/index/etc.\n",
565
+ "# The description should not be one of the questions but a name that best describes the similar data elements.\"\"\"\n",
566
+ "\n",
567
+ "response_text = rag_chain.invoke({\"question\": question_text})\n",
568
+ "# response = rag_chain.invoke({\"question\": question_text})"
569
+ ]
570
+ },
571
+ {
572
+ "cell_type": "code",
573
+ "execution_count": 67,
574
+ "metadata": {},
575
+ "outputs": [],
576
+ "source": [
577
+ "import json\n",
578
+ "import pandas as pd\n",
579
+ "\n",
580
+ "def parse_rag_output(response_text):\n",
581
+ " \"\"\"Extract structured JSON data from the RAG response.\"\"\"\n",
582
+ " try:\n",
583
+ " structured_data = json.loads(response_text)\n",
584
+ "\n",
585
+ " # Ensure similarity score is always included\n",
586
+ " for item in structured_data:\n",
587
+ " item.setdefault(\"Similarity Score\", \"N/A\") # Default if missing\n",
588
+ "\n",
589
+ " return structured_data\n",
590
+ " except json.JSONDecodeError:\n",
591
+ " print(\"Error: Response is not valid JSON.\")\n",
592
+ " return None\n",
593
+ "\n",
594
+ "def save_to_csv(data, directory=\"./output\", filename=\"matching_data_elements.csv\"):\n",
595
+ " \"\"\"Save structured data to CSV.\"\"\"\n",
596
+ " if not data:\n",
597
+ " print(\"No data to save.\")\n",
598
+ " return\n",
599
+ "\n",
600
+ " file_path = os.path.join(directory, filename)\n",
601
+ " df = pd.DataFrame(data, columns=[\"Derived Description\", \"Protocol_1\", \"Protocol_2\"]) # Ensure correct columns\n",
602
+ " df.to_csv(file_path, index=False)\n",
603
+ " print(f\"✅ CSV file saved: {filename}\")\n",
604
+ "\n",
605
+ "# Run the pipeline\n",
606
+ "structured_output = parse_rag_output(response_text)\n",
607
+ "save_to_csv(structured_output)\n"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": null,
613
+ "metadata": {},
614
+ "outputs": [],
615
+ "source": []
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": 54,
620
+ "metadata": {},
621
+ "outputs": [
622
+ {
623
+ "data": {
624
+ "text/plain": [
625
+ "'[\\n {\\n \"Derived Description\": \"Memory Recall\",\\n \"Protocol_1_Name\": \"I struggle to remember names and faces. (Scale: 0-3)\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Memory Retention\",\\n \"Protocol_1_Name\": \"I retain new information effectively.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Mnemonic Techniques\",\\n \"Protocol_1_Name\": \"I practice mnemonic techniques to improve recall.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Task Management Difficulty\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I find it difficult to keep track of multiple responsibilities. (Scale: 0-3)\"\\n },\\n {\\n \"Derived Description\": \"Mental Fatigue in Problem-Solving\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I get mentally fatigued quickly when problem-solving. (Scale: 0-3)\"\\n },\\n {\\n \"Derived Description\": \"Task Organization Techniques\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I use structured techniques to organize my tasks. (Scale: 0-3)\"\\n }\\n]'"
626
+ ]
627
+ },
628
+ "execution_count": 54,
629
+ "metadata": {},
630
+ "output_type": "execute_result"
631
+ }
632
+ ],
633
+ "source": [
634
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2\"})"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": 31,
640
+ "metadata": {},
641
+ "outputs": [
642
+ {
643
+ "data": {
644
+ "text/plain": [
645
+ "'[\\n {\\n \"Derived Description\": \"Memory Recall\",\\n \"Protocol_1_Name\": \"I struggle to remember names and faces.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Retaining Information\",\\n \"Protocol_1_Name\": \"I retain new information effectively.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Mnemonic Techniques\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I practice mnemonic techniques to improve recall.\"\\n },\\n {\\n \"Derived Description\": \"Pain Management Preparation\",\\n \"Protocol_1_Name\": \"I mentally prepare myself before engaging in painful activities.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Pain Minimization Techniques\",\\n \"Protocol_1_Name\": \"I use relaxation techniques to minimize pain perception.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Breathing Exercises for Pain\",\\n \"Protocol_1_Name\": \"I use breathing exercises to manage pain episodes.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Avoiding Painful Activities\",\\n \"Protocol_1_Name\": \"I avoid specific physical activities that increase my pain.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Work Exhaustion\",\\n \"Protocol_1_Name\": \"I feel exhausted after a standard workday.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Motivation and Stress\",\\n \"Protocol_1_Name\": \"I struggle to stay motivated due to workplace stress.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Handling Multiple Responsibilities\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I find it difficult to keep track of multiple responsibilities.\"\\n },\\n {\\n \"Derived Description\": \"Mental Fatigue from Problem-Solving\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I get mentally fatigued quickly when problem-solving.\"\\n },\\n {\\n \"Derived Description\": \"Structured Task Organization\",\\n \"Protocol_1_Name\": \"could not match\",\\n \"Protocol_2_Name\": \"I use structured techniques to organize my tasks.\"\\n },\\n {\\n \"Derived Description\": \"Overwhelmed by Responsibilities\",\\n \"Protocol_1_Name\": \"I feel overwhelmed when handling multiple responsibilities.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Disconnecting from Work\",\\n \"Protocol_1_Name\": \"I find it difficult to disconnect from work-related concerns.\",\\n \"Protocol_2_Name\": \"could not match\"\\n },\\n {\\n \"Derived Description\": \"Sleep Disturbances from Work Stress\",\\n \"Protocol_1_Name\": \"I experience sleep disturbances due to work-related stress.\",\\n \"Protocol_2_Name\": \"could not match\"\\n }\\n]'"
646
+ ]
647
+ },
648
+ "execution_count": 31,
649
+ "metadata": {},
650
+ "output_type": "execute_result"
651
+ }
652
+ ],
653
+ "source": [
654
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2. these are example headings not the ones in the actual documents. just list the matches not the rational. Can you list multiple matches?\"})"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "code",
659
+ "execution_count": null,
660
+ "metadata": {},
661
+ "outputs": [],
662
+ "source": []
663
+ }
664
+ ],
665
+ "metadata": {
666
+ "kernelspec": {
667
+ "display_name": ".venv",
668
+ "language": "python",
669
+ "name": "python3"
670
+ },
671
+ "language_info": {
672
+ "codemirror_mode": {
673
+ "name": "ipython",
674
+ "version": 3
675
+ },
676
+ "file_extension": ".py",
677
+ "mimetype": "text/x-python",
678
+ "name": "python",
679
+ "nbconvert_exporter": "python",
680
+ "pygments_lexer": "ipython3",
681
+ "version": "3.13.1"
682
+ }
683
+ },
684
+ "nbformat": 4,
685
+ "nbformat_minor": 2
686
+ }
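Editor's note on the notebook above: parse_rag_output calls json.loads directly on the model output, so a response wrapped in Markdown code fences would fail to parse and be dropped. A hedged sketch of a more tolerant parser is below; the function name and the fence-stripping regexes are illustrative assumptions, not part of the commit.

import json
import re

def parse_rag_output_lenient(response_text: str):
    """Parse the model's JSON list, tolerating optional Markdown code fences."""
    cleaned = response_text.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned)  # strip a leading fence, if any
    cleaned = re.sub(r"\s*```$", "", cleaned)           # strip a trailing fence, if any
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("Error: Response is not valid JSON.")
        return None

The rest of the pipeline (save_to_csv and the DataFrame columns) would be unchanged; only the parsing step becomes more forgiving.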
02-testembedtune copy.ipynb ADDED
@@ -0,0 +1,1282 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 19,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# !pip install nest_asyncio \\\n",
10
+ "# langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters \\\n",
11
+ "# python-pptx==1.0.2 nltk==3.9.1 pymupdf lxml \\\n",
12
+ "# sentence-transformers IProgress \\\n",
13
+ "# huggingface_hub ipywidgets \\\n",
14
+ "# qdrant-client langchain_experimental\n",
15
+ "\n",
16
+ "# !pip install sentence_transformers datasets pyarrow\n",
17
+ "# !pip install torch\n",
18
+ "# !pip install accelerate>=0.26.0\n",
19
+ "# !pip install transformers\n",
20
+ "# !pip install wandb\n",
21
+ "\n"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 2,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "\n",
31
+ "import nest_asyncio\n",
32
+ "\n",
33
+ "nest_asyncio.apply()"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "#!pip install -qU langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 4,
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": [
51
+ "#!pip install -qU faiss-cpu python-pptx==1.0.2 nltk==3.9.1 pymupdf beautifulsoup4 lxml"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 5,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "#!pip install -qU sentence-transformers\n",
61
+ "#!pip install -qU IProgress\n"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": 6,
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "import os\n",
71
+ "import getpass\n",
72
+ "\n",
73
+ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key: \")"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 7,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "hf_username = getpass.getpass(\"Enter Your Hugging Face Username: \")\n"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 8,
88
+ "metadata": {},
89
+ "outputs": [
90
+ {
91
+ "data": {
92
+ "application/vnd.jupyter.widget-view+json": {
93
+ "model_id": "df7fbe16b4c44797abc886b87583af59",
94
+ "version_major": 2,
95
+ "version_minor": 0
96
+ },
97
+ "text/plain": [
98
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
99
+ ]
100
+ },
101
+ "metadata": {},
102
+ "output_type": "display_data"
103
+ }
104
+ ],
105
+ "source": [
106
+ "from huggingface_hub import notebook_login\n",
107
+ "notebook_login()"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 9,
113
+ "metadata": {},
114
+ "outputs": [
115
+ {
116
+ "name": "stdout",
117
+ "output_type": "stream",
118
+ "text": [
119
+ "{'type': 'user', 'id': '67624d1b57e77fe6e0c87ae5', 'name': 'drewgenai', 'fullname': 'Drew DeMarco', 'email': '[email protected]', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/L6eLaZmCK4jqW3ZTLYIAR.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'newotken', 'role': 'write', 'createdAt': '2025-02-12T04:11:04.130Z'}}}\n"
120
+ ]
121
+ }
122
+ ],
123
+ "source": [
124
+ "from huggingface_hub import whoami\n",
125
+ "print(whoami())\n"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 10,
131
+ "metadata": {},
132
+ "outputs": [
133
+ {
134
+ "name": "stdout",
135
+ "output_type": "stream",
136
+ "text": [
137
+ "mkdir: cannot create directory ‘example_files’: File exists\n",
138
+ "mkdir: cannot create directory ‘output’: File exists\n"
139
+ ]
140
+ }
141
+ ],
142
+ "source": [
143
+ "!mkdir example_files\n",
144
+ "!mkdir output"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 11,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "from langchain_community.document_loaders import DirectoryLoader\n",
154
+ "from langchain_community.document_loaders import PyMuPDFLoader\n",
155
+ "\n",
156
+ "path = \"example_files/\"\n",
157
+ "text_loader = DirectoryLoader(path, glob=\"*.pdf\", loader_cls=PyMuPDFLoader)"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "markdown",
162
+ "metadata": {},
163
+ "source": [
164
+ "1️⃣ Header-Based Chunking (Title-Based Splitter)\n",
165
+ "Uses document structure to split on headings, section titles, or patterns.\n",
166
+ "Works well for structured documents with named assessments, numbered lists, or headers.\n",
167
+ "Example: If it detects Chronic Pain Adjustment Index (CPAI-10), it groups everything under that title.\n",
168
+ "2️⃣ Semantic Chunking (Text-Meaning Splitter)\n",
169
+ "Uses embeddings or sentence similarity to decide where to break chunks.\n",
170
+ "Prevents splitting mid-context if sentences are closely related.\n",
171
+ "Example: Groups all related pain-assessment questions into one chunk."
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": []
180
+ },
181
+ {
182
+ "cell_type": "markdown",
183
+ "metadata": {},
184
+ "source": [
185
+ "###testingbelow\n"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 12,
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "# !pip install langchain_experimental"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": 13,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "\n",
204
+ "\n",
205
+ "# #might need to remove all together - don't think it's working\n",
206
+ "# !pip install --upgrade langchain langchain-experimental\n",
207
+ "# !pip install --upgrade langchain-community\n",
208
+ "# !pip install langchain langchain-experimental langchain-community\n",
209
+ "\n"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": 14,
215
+ "metadata": {},
216
+ "outputs": [
217
+ {
218
+ "name": "stderr",
219
+ "output_type": "stream",
220
+ "text": [
221
+ "/tmp/ipykernel_76652/2495904805.py:7: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-huggingface package and should be used instead. To use it run `pip install -U :class:`~langchain-huggingface` and import as `from :class:`~langchain_huggingface import HuggingFaceEmbeddings``.\n",
222
+ " embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n"
223
+ ]
224
+ }
225
+ ],
226
+ "source": [
227
+ "\n",
228
+ "\n",
229
+ "from langchain_experimental.text_splitter import SemanticChunker\n",
230
+ "\n",
231
+ "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
232
+ "\n",
233
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
234
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
235
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
236
+ "\n",
237
+ "semantic_splitter = SemanticChunker(embedding_model)\n",
238
+ "\n",
239
+ "all_documents = text_loader.load()\n",
240
+ "documents_with_metadata = []\n",
241
+ "\n"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 15,
247
+ "metadata": {},
248
+ "outputs": [],
249
+ "source": [
250
+ "from langchain.schema import Document\n",
251
+ "\n",
252
+ "for doc in all_documents:\n",
253
+ " source_name = doc.metadata.get(\"source\", \"unknown\") # Get document source\n",
254
+ "\n",
255
+ " # Use SemanticChunker to intelligently split text\n",
256
+ " chunks = semantic_splitter.split_text(doc.page_content)\n",
257
+ "\n",
258
+ " # Convert chunks into LangChain Document format with metadata\n",
259
+ " for chunk in chunks:\n",
260
+ " doc_chunk = Document(page_content=chunk, metadata={\"source\": source_name})\n",
261
+ " documents_with_metadata.append(doc_chunk)"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "markdown",
266
+ "metadata": {},
267
+ "source": [
268
+ "##########################new testing below"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 16,
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "#training_documents = text_loader.load()\n",
278
+ "training_documents = documents_with_metadata"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 17,
284
+ "metadata": {},
285
+ "outputs": [],
286
+ "source": [
287
+ "import uuid\n",
288
+ "\n",
289
+ "id_set = set()\n",
290
+ "\n",
291
+ "for document in training_documents:\n",
292
+ " id = str(uuid.uuid4())\n",
293
+ " while id in id_set:\n",
294
+ " id = uuid.uuid4()\n",
295
+ " id_set.add(id)\n",
296
+ " document.metadata[\"id\"] = id"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": 18,
302
+ "metadata": {},
303
+ "outputs": [
304
+ {
305
+ "name": "stdout",
306
+ "output_type": "stream",
307
+ "text": [
308
+ "Training set: 4 docs\n",
309
+ "Validation set: 1 docs\n",
310
+ "Test set: 2 docs\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "# Define split percentages\n",
316
+ "train_ratio = 0.7 # 70% training\n",
317
+ "val_ratio = 0.2 # 20% validation\n",
318
+ "test_ratio = 0.1 # 10% test\n",
319
+ "\n",
320
+ "# Calculate index breakpoints\n",
321
+ "total_docs = len(training_documents)\n",
322
+ "train_size = int(total_docs * train_ratio)\n",
323
+ "val_size = int(total_docs * val_ratio)\n",
324
+ "\n",
325
+ "# Perform the splits\n",
326
+ "training_split_documents = training_documents[:train_size]\n",
327
+ "val_split_documents = training_documents[train_size:train_size + val_size]\n",
328
+ "test_split_documents = training_documents[train_size + val_size:]\n",
329
+ "\n",
330
+ "# Print sizes to verify\n",
331
+ "print(f\"Training set: {len(training_split_documents)} docs\")\n",
332
+ "print(f\"Validation set: {len(val_split_documents)} docs\")\n",
333
+ "print(f\"Test set: {len(test_split_documents)} docs\")\n",
334
+ "\n",
335
+ "\n"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "execution_count": 19,
341
+ "metadata": {},
342
+ "outputs": [],
343
+ "source": [
344
+ "from langchain_openai import ChatOpenAI\n",
345
+ "\n",
346
+ "qa_chat_model = ChatOpenAI(\n",
347
+ " model=\"gpt-4o-mini\",\n",
348
+ " temperature=0\n",
349
+ ")"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": 22,
355
+ "metadata": {},
356
+ "outputs": [],
357
+ "source": [
358
+ "from langchain_core.prompts import ChatPromptTemplate\n",
359
+ "\n",
360
+ "qa_prompt = \"\"\"\\\n",
361
+ "Given the following context, you must generate questions based on only the provided context.\n",
362
+ "\n",
363
+ "You are to generate {n_questions} questions which should be provided in the following format:\n",
364
+ "\n",
365
+ "1. QUESTION #1\n",
366
+ "2. QUESTION #2\n",
367
+ "...\n",
368
+ "\n",
369
+ "Context:\n",
370
+ "{context}\n",
371
+ "\"\"\"\n",
372
+ "\n",
373
+ "qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)"
374
+ ]
375
+ },
376
+ {
377
+ "cell_type": "code",
378
+ "execution_count": 23,
379
+ "metadata": {},
380
+ "outputs": [],
381
+ "source": [
382
+ "question_generation_chain = qa_prompt_template | qa_chat_model"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 24,
388
+ "metadata": {},
389
+ "outputs": [],
390
+ "source": [
391
+ "import asyncio\n",
392
+ "import uuid\n",
393
+ "from tqdm import tqdm\n",
394
+ "\n",
395
+ "async def process_document(document, n_questions):\n",
396
+ " questions_generated = await question_generation_chain.ainvoke({\"context\": document.page_content, \"n_questions\": n_questions})\n",
397
+ "\n",
398
+ " doc_questions = {}\n",
399
+ " doc_relevant_docs = {}\n",
400
+ "\n",
401
+ " for question in questions_generated.content.split(\"\\n\"):\n",
402
+ " question_id = str(uuid.uuid4())\n",
403
+ " doc_questions[question_id] = \"\".join(question.split(\".\")[1:]).strip()\n",
404
+ " doc_relevant_docs[question_id] = [document.metadata[\"id\"]]\n",
405
+ "\n",
406
+ " return doc_questions, doc_relevant_docs\n",
407
+ "\n",
408
+ "async def create_questions(documents, n_questions):\n",
409
+ " tasks = [process_document(doc, n_questions) for doc in documents]\n",
410
+ "\n",
411
+ " questions = {}\n",
412
+ " relevant_docs = {}\n",
413
+ "\n",
414
+ " for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc=\"Processing documents\"):\n",
415
+ " doc_questions, doc_relevant_docs = await task\n",
416
+ " questions.update(doc_questions)\n",
417
+ " relevant_docs.update(doc_relevant_docs)\n",
418
+ "\n",
419
+ " return questions, relevant_docs"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 25,
425
+ "metadata": {},
426
+ "outputs": [
427
+ {
428
+ "name": "stderr",
429
+ "output_type": "stream",
430
+ "text": [
431
+ "Processing documents: 100%|██████████| 4/4 [00:01<00:00, 3.75it/s]\n",
432
+ "Processing documents: 100%|██████████| 1/1 [00:00<00:00, 1.21it/s]\n",
433
+ "Processing documents: 100%|██████████| 2/2 [00:01<00:00, 1.98it/s]\n"
434
+ ]
435
+ }
436
+ ],
437
+ "source": [
438
+ "training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)\n",
439
+ "val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)\n",
440
+ "test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)"
441
+ ]
442
+ },
443
+ {
444
+ "cell_type": "code",
445
+ "execution_count": 26,
446
+ "metadata": {},
447
+ "outputs": [],
448
+ "source": [
449
+ "import json\n",
450
+ "\n",
451
+ "training_corpus = {train_item.metadata[\"id\"] : train_item.page_content for train_item in training_split_documents}\n",
452
+ "\n",
453
+ "train_dataset = {\n",
454
+ " \"questions\" : training_questions,\n",
455
+ " \"relevant_contexts\" : training_relevant_contexts,\n",
456
+ " \"corpus\" : training_corpus\n",
457
+ "}\n",
458
+ "\n",
459
+ "with open(\"training_dataset.jsonl\", \"w\") as f:\n",
460
+ " json.dump(train_dataset, f)\n",
461
+ "\n",
462
+ "\n",
463
+ "val_corpus = {val_item.metadata[\"id\"] : val_item.page_content for val_item in val_split_documents}\n",
464
+ "\n",
465
+ "val_dataset = {\n",
466
+ " \"questions\" : val_questions,\n",
467
+ " \"relevant_contexts\" : val_relevant_contexts,\n",
468
+ " \"corpus\" : val_corpus\n",
469
+ "}\n",
470
+ "\n",
471
+ "with open(\"val_dataset.jsonl\", \"w\") as f:\n",
472
+ " json.dump(val_dataset, f)\n",
473
+ "\n",
474
+ "\n",
475
+ "test_corpus = {test_item.metadata[\"id\"] : test_item.page_content for test_item in test_split_documents}\n",
476
+ "\n",
477
+ "test_dataset = {\n",
478
+ " \"questions\" : test_questions,\n",
479
+ " \"relevant_contexts\" : test_relevant_contexts,\n",
480
+ " \"corpus\" : test_corpus\n",
481
+ "}\n",
482
+ "\n",
483
+ "with open(\"test_dataset.jsonl\", \"w\") as f:\n",
484
+ " json.dump(test_dataset, f)"
485
+ ]
486
+ },
487
+ {
488
+ "cell_type": "code",
489
+ "execution_count": 27,
490
+ "metadata": {},
491
+ "outputs": [],
492
+ "source": [
493
+ "# !pip install -qU sentence_transformers datasets pyarrow"
494
+ ]
495
+ },
496
+ {
497
+ "cell_type": "code",
498
+ "execution_count": 28,
499
+ "metadata": {},
500
+ "outputs": [],
501
+ "source": [
502
+ "from sentence_transformers import SentenceTransformer\n",
503
+ "\n",
504
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
505
+ "model = SentenceTransformer(model_id)"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 29,
511
+ "metadata": {},
512
+ "outputs": [],
513
+ "source": [
514
+ "from torch.utils.data import DataLoader\n",
515
+ "from torch.utils.data import Dataset\n",
516
+ "from sentence_transformers import InputExample"
517
+ ]
518
+ },
519
+ {
520
+ "cell_type": "code",
521
+ "execution_count": 30,
522
+ "metadata": {},
523
+ "outputs": [],
524
+ "source": [
525
+ "BATCH_SIZE = 10"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "code",
530
+ "execution_count": 31,
531
+ "metadata": {},
532
+ "outputs": [],
533
+ "source": [
534
+ "corpus = train_dataset['corpus']\n",
535
+ "queries = train_dataset['questions']\n",
536
+ "relevant_docs = train_dataset['relevant_contexts']\n",
537
+ "\n",
538
+ "examples = []\n",
539
+ "for query_id, query in queries.items():\n",
540
+ " doc_id = relevant_docs[query_id][0]\n",
541
+ " text = corpus[doc_id]\n",
542
+ " example = InputExample(texts=[query, text])\n",
543
+ " examples.append(example)"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "execution_count": 32,
549
+ "metadata": {},
550
+ "outputs": [],
551
+ "source": [
552
+ "loader = DataLoader(\n",
553
+ " examples, batch_size=BATCH_SIZE\n",
554
+ ")"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "code",
559
+ "execution_count": 33,
560
+ "metadata": {},
561
+ "outputs": [],
562
+ "source": [
563
+ "from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss\n",
564
+ "\n",
565
+ "matryoshka_dimensions = [768, 512, 256, 128, 64]\n",
566
+ "inner_train_loss = MultipleNegativesRankingLoss(model)\n",
567
+ "train_loss = MatryoshkaLoss(\n",
568
+ " model, inner_train_loss, matryoshka_dims=matryoshka_dimensions\n",
569
+ ")"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "code",
574
+ "execution_count": 34,
575
+ "metadata": {},
576
+ "outputs": [],
577
+ "source": [
578
+ "from sentence_transformers.evaluation import InformationRetrievalEvaluator\n",
579
+ "\n",
580
+ "corpus = val_dataset['corpus']\n",
581
+ "queries = val_dataset['questions']\n",
582
+ "relevant_docs = val_dataset['relevant_contexts']\n",
583
+ "\n",
584
+ "evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": 35,
590
+ "metadata": {},
591
+ "outputs": [],
592
+ "source": [
593
+ "EPOCHS = 5"
594
+ ]
595
+ },
596
+ {
597
+ "cell_type": "code",
598
+ "execution_count": 36,
599
+ "metadata": {},
600
+ "outputs": [
601
+ {
602
+ "data": {
603
+ "text/html": [
604
+ "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/dummy/dummy/runs/bel6hiln?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
605
+ ],
606
+ "text/plain": [
607
+ "<wandb.sdk.wandb_run.Run at 0x72704850af90>"
608
+ ]
609
+ },
610
+ "execution_count": 36,
611
+ "metadata": {},
612
+ "output_type": "execute_result"
613
+ }
614
+ ],
615
+ "source": [
616
+ "#!pip install wandb\n",
617
+ "\n",
618
+ "import wandb\n",
619
+ "wandb.init(mode=\"disabled\")"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": 37,
625
+ "metadata": {},
626
+ "outputs": [],
627
+ "source": [
628
+ "# !pip install torch\n",
629
+ "# !pip install accelerate>=0.26.0\n",
630
+ "# !pip install transformers\n",
631
+ "\n"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": 38,
637
+ "metadata": {},
638
+ "outputs": [],
639
+ "source": [
640
+ "#!pip install --upgrade --force-reinstall transformers accelerate torch\n",
641
+ "#!which python\n",
642
+ "\n"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": 46,
648
+ "metadata": {},
649
+ "outputs": [
650
+ {
651
+ "data": {
652
+ "text/html": [
653
+ "\n",
654
+ " <div>\n",
655
+ " \n",
656
+ " <progress value='5' max='5' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
657
+ " [5/5 00:01, Epoch 5/5]\n",
658
+ " </div>\n",
659
+ " <table border=\"1\" class=\"dataframe\">\n",
660
+ " <thead>\n",
661
+ " <tr style=\"text-align: left;\">\n",
662
+ " <th>Step</th>\n",
663
+ " <th>Training Loss</th>\n",
664
+ " <th>Validation Loss</th>\n",
665
+ " <th>Cosine Accuracy@1</th>\n",
666
+ " <th>Cosine Accuracy@3</th>\n",
667
+ " <th>Cosine Accuracy@5</th>\n",
668
+ " <th>Cosine Accuracy@10</th>\n",
669
+ " <th>Cosine Precision@1</th>\n",
670
+ " <th>Cosine Precision@3</th>\n",
671
+ " <th>Cosine Precision@5</th>\n",
672
+ " <th>Cosine Precision@10</th>\n",
673
+ " <th>Cosine Recall@1</th>\n",
674
+ " <th>Cosine Recall@3</th>\n",
675
+ " <th>Cosine Recall@5</th>\n",
676
+ " <th>Cosine Recall@10</th>\n",
677
+ " <th>Cosine Ndcg@10</th>\n",
678
+ " <th>Cosine Mrr@10</th>\n",
679
+ " <th>Cosine Map@100</th>\n",
680
+ " </tr>\n",
681
+ " </thead>\n",
682
+ " <tbody>\n",
683
+ " <tr>\n",
684
+ " <td>1</td>\n",
685
+ " <td>No log</td>\n",
686
+ " <td>No log</td>\n",
687
+ " <td>1.000000</td>\n",
688
+ " <td>1.000000</td>\n",
689
+ " <td>1.000000</td>\n",
690
+ " <td>1.000000</td>\n",
691
+ " <td>1.000000</td>\n",
692
+ " <td>0.333333</td>\n",
693
+ " <td>0.200000</td>\n",
694
+ " <td>0.100000</td>\n",
695
+ " <td>1.000000</td>\n",
696
+ " <td>1.000000</td>\n",
697
+ " <td>1.000000</td>\n",
698
+ " <td>1.000000</td>\n",
699
+ " <td>1.000000</td>\n",
700
+ " <td>1.000000</td>\n",
701
+ " <td>1.000000</td>\n",
702
+ " </tr>\n",
703
+ " <tr>\n",
704
+ " <td>2</td>\n",
705
+ " <td>No log</td>\n",
706
+ " <td>No log</td>\n",
707
+ " <td>1.000000</td>\n",
708
+ " <td>1.000000</td>\n",
709
+ " <td>1.000000</td>\n",
710
+ " <td>1.000000</td>\n",
711
+ " <td>1.000000</td>\n",
712
+ " <td>0.333333</td>\n",
713
+ " <td>0.200000</td>\n",
714
+ " <td>0.100000</td>\n",
715
+ " <td>1.000000</td>\n",
716
+ " <td>1.000000</td>\n",
717
+ " <td>1.000000</td>\n",
718
+ " <td>1.000000</td>\n",
719
+ " <td>1.000000</td>\n",
720
+ " <td>1.000000</td>\n",
721
+ " <td>1.000000</td>\n",
722
+ " </tr>\n",
723
+ " <tr>\n",
724
+ " <td>3</td>\n",
725
+ " <td>No log</td>\n",
726
+ " <td>No log</td>\n",
727
+ " <td>1.000000</td>\n",
728
+ " <td>1.000000</td>\n",
729
+ " <td>1.000000</td>\n",
730
+ " <td>1.000000</td>\n",
731
+ " <td>1.000000</td>\n",
732
+ " <td>0.333333</td>\n",
733
+ " <td>0.200000</td>\n",
734
+ " <td>0.100000</td>\n",
735
+ " <td>1.000000</td>\n",
736
+ " <td>1.000000</td>\n",
737
+ " <td>1.000000</td>\n",
738
+ " <td>1.000000</td>\n",
739
+ " <td>1.000000</td>\n",
740
+ " <td>1.000000</td>\n",
741
+ " <td>1.000000</td>\n",
742
+ " </tr>\n",
743
+ " <tr>\n",
744
+ " <td>4</td>\n",
745
+ " <td>No log</td>\n",
746
+ " <td>No log</td>\n",
747
+ " <td>1.000000</td>\n",
748
+ " <td>1.000000</td>\n",
749
+ " <td>1.000000</td>\n",
750
+ " <td>1.000000</td>\n",
751
+ " <td>1.000000</td>\n",
752
+ " <td>0.333333</td>\n",
753
+ " <td>0.200000</td>\n",
754
+ " <td>0.100000</td>\n",
755
+ " <td>1.000000</td>\n",
756
+ " <td>1.000000</td>\n",
757
+ " <td>1.000000</td>\n",
758
+ " <td>1.000000</td>\n",
759
+ " <td>1.000000</td>\n",
760
+ " <td>1.000000</td>\n",
761
+ " <td>1.000000</td>\n",
762
+ " </tr>\n",
763
+ " <tr>\n",
764
+ " <td>5</td>\n",
765
+ " <td>No log</td>\n",
766
+ " <td>No log</td>\n",
767
+ " <td>1.000000</td>\n",
768
+ " <td>1.000000</td>\n",
769
+ " <td>1.000000</td>\n",
770
+ " <td>1.000000</td>\n",
771
+ " <td>1.000000</td>\n",
772
+ " <td>0.333333</td>\n",
773
+ " <td>0.200000</td>\n",
774
+ " <td>0.100000</td>\n",
775
+ " <td>1.000000</td>\n",
776
+ " <td>1.000000</td>\n",
777
+ " <td>1.000000</td>\n",
778
+ " <td>1.000000</td>\n",
779
+ " <td>1.000000</td>\n",
780
+ " <td>1.000000</td>\n",
781
+ " <td>1.000000</td>\n",
782
+ " </tr>\n",
783
+ " </tbody>\n",
784
+ "</table><p>"
785
+ ],
786
+ "text/plain": [
787
+ "<IPython.core.display.HTML object>"
788
+ ]
789
+ },
790
+ "metadata": {},
791
+ "output_type": "display_data"
792
+ }
793
+ ],
794
+ "source": [
795
+ "warmup_steps = int(len(loader) * EPOCHS * 0.1)\n",
796
+ "\n",
797
+ "model.fit(\n",
798
+ " train_objectives=[(loader, train_loss)],\n",
799
+ " epochs=EPOCHS,\n",
800
+ " warmup_steps=warmup_steps,\n",
801
+ " output_path='models/midterm-compare-arctic-embed-m-ft',\n",
802
+ " show_progress_bar=True,\n",
803
+ " evaluator=evaluator,\n",
804
+ " evaluation_steps=50\n",
805
+ ")"
806
+ ]
807
+ },
808
+ {
809
+ "cell_type": "code",
810
+ "execution_count": 47,
811
+ "metadata": {},
812
+ "outputs": [
813
+ {
814
+ "data": {
815
+ "application/vnd.jupyter.widget-view+json": {
816
+ "model_id": "c3832f15349447c59ef0b7950d732a59",
817
+ "version_major": 2,
818
+ "version_minor": 0
819
+ },
820
+ "text/plain": [
821
+ "model.safetensors: 0%| | 0.00/436M [00:00<?, ?B/s]"
822
+ ]
823
+ },
824
+ "metadata": {},
825
+ "output_type": "display_data"
826
+ },
827
+ {
828
+ "data": {
829
+ "text/plain": [
830
+ "'https://huggingface.co/drewgenai/midterm-compare-arctic-embed-m-ft/commit/695a90e0d9d4a6ca560a5844c0e5a7cf4c4c74a9'"
831
+ ]
832
+ },
833
+ "execution_count": 47,
834
+ "metadata": {},
835
+ "output_type": "execute_result"
836
+ }
837
+ ],
838
+ "source": [
839
+ "model.push_to_hub(f\"{hf_username}/midterm-compare-arctic-embed-m-ft\")"
840
+ ]
841
+ },
842
+ {
843
+ "cell_type": "code",
844
+ "execution_count": 48,
845
+ "metadata": {},
846
+ "outputs": [
847
+ {
848
+ "data": {
849
+ "application/vnd.jupyter.widget-view+json": {
850
+ "model_id": "5a84694a9cff451581d43a244cbd6ce5",
851
+ "version_major": 2,
852
+ "version_minor": 0
853
+ },
854
+ "text/plain": [
855
+ "modules.json: 0%| | 0.00/349 [00:00<?, ?B/s]"
856
+ ]
857
+ },
858
+ "metadata": {},
859
+ "output_type": "display_data"
860
+ },
861
+ {
862
+ "data": {
863
+ "application/vnd.jupyter.widget-view+json": {
864
+ "model_id": "d9635815ad784cc68833f2b4199c611b",
865
+ "version_major": 2,
866
+ "version_minor": 0
867
+ },
868
+ "text/plain": [
869
+ "config_sentence_transformers.json: 0%| | 0.00/281 [00:00<?, ?B/s]"
870
+ ]
871
+ },
872
+ "metadata": {},
873
+ "output_type": "display_data"
874
+ },
875
+ {
876
+ "data": {
877
+ "application/vnd.jupyter.widget-view+json": {
878
+ "model_id": "b425eef83f6c47cf90d9ad8df35bed07",
879
+ "version_major": 2,
880
+ "version_minor": 0
881
+ },
882
+ "text/plain": [
883
+ "README.md: 0%| | 0.00/26.3k [00:00<?, ?B/s]"
884
+ ]
885
+ },
886
+ "metadata": {},
887
+ "output_type": "display_data"
888
+ },
889
+ {
890
+ "data": {
891
+ "application/vnd.jupyter.widget-view+json": {
892
+ "model_id": "1c080b01bb4c43e3b0af3da190feff91",
893
+ "version_major": 2,
894
+ "version_minor": 0
895
+ },
896
+ "text/plain": [
897
+ "sentence_bert_config.json: 0%| | 0.00/53.0 [00:00<?, ?B/s]"
898
+ ]
899
+ },
900
+ "metadata": {},
901
+ "output_type": "display_data"
902
+ },
903
+ {
904
+ "data": {
905
+ "application/vnd.jupyter.widget-view+json": {
906
+ "model_id": "8ebbd4faaa99434fbd6413f24fadc8b1",
907
+ "version_major": 2,
908
+ "version_minor": 0
909
+ },
910
+ "text/plain": [
911
+ "config.json: 0%| | 0.00/675 [00:00<?, ?B/s]"
912
+ ]
913
+ },
914
+ "metadata": {},
915
+ "output_type": "display_data"
916
+ },
917
+ {
918
+ "data": {
919
+ "application/vnd.jupyter.widget-view+json": {
920
+ "model_id": "5ef43ded862f4e5685af4b66e51922af",
921
+ "version_major": 2,
922
+ "version_minor": 0
923
+ },
924
+ "text/plain": [
925
+ "model.safetensors: 0%| | 0.00/436M [00:00<?, ?B/s]"
926
+ ]
927
+ },
928
+ "metadata": {},
929
+ "output_type": "display_data"
930
+ },
931
+ {
932
+ "name": "stderr",
933
+ "output_type": "stream",
934
+ "text": [
935
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
936
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
937
+ ]
938
+ },
939
+ {
940
+ "data": {
941
+ "application/vnd.jupyter.widget-view+json": {
942
+ "model_id": "f2704b3d8d214414acf54e23efb2de25",
943
+ "version_major": 2,
944
+ "version_minor": 0
945
+ },
946
+ "text/plain": [
947
+ "tokenizer_config.json: 0%| | 0.00/1.41k [00:00<?, ?B/s]"
948
+ ]
949
+ },
950
+ "metadata": {},
951
+ "output_type": "display_data"
952
+ },
953
+ {
954
+ "data": {
955
+ "application/vnd.jupyter.widget-view+json": {
956
+ "model_id": "70d0aca65df94b8c973d9e2aef700c6b",
957
+ "version_major": 2,
958
+ "version_minor": 0
959
+ },
960
+ "text/plain": [
961
+ "vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]"
962
+ ]
963
+ },
964
+ "metadata": {},
965
+ "output_type": "display_data"
966
+ },
967
+ {
968
+ "data": {
969
+ "application/vnd.jupyter.widget-view+json": {
970
+ "model_id": "b8a288bc2740416d8be044c1534138a0",
971
+ "version_major": 2,
972
+ "version_minor": 0
973
+ },
974
+ "text/plain": [
975
+ "tokenizer.json: 0%| | 0.00/712k [00:00<?, ?B/s]"
976
+ ]
977
+ },
978
+ "metadata": {},
979
+ "output_type": "display_data"
980
+ },
981
+ {
982
+ "data": {
983
+ "application/vnd.jupyter.widget-view+json": {
984
+ "model_id": "fd5494a1b2d2483884ccdfeaaf03e65c",
985
+ "version_major": 2,
986
+ "version_minor": 0
987
+ },
988
+ "text/plain": [
989
+ "special_tokens_map.json: 0%| | 0.00/695 [00:00<?, ?B/s]"
990
+ ]
991
+ },
992
+ "metadata": {},
993
+ "output_type": "display_data"
994
+ },
995
+ {
996
+ "data": {
997
+ "application/vnd.jupyter.widget-view+json": {
998
+ "model_id": "e6259269b65b45358940c42ac8e9d127",
999
+ "version_major": 2,
1000
+ "version_minor": 0
1001
+ },
1002
+ "text/plain": [
1003
+ "1_Pooling%2Fconfig.json: 0%| | 0.00/296 [00:00<?, ?B/s]"
1004
+ ]
1005
+ },
1006
+ "metadata": {},
1007
+ "output_type": "display_data"
1008
+ }
1009
+ ],
1010
+ "source": [
1011
+ "finetune_embeddings = HuggingFaceEmbeddings(model_name=f\"{hf_username}/midterm-compare-arctic-embed-m-ft\")"
1012
+ ]
1013
+ },
1014
+ {
1015
+ "cell_type": "markdown",
1016
+ "metadata": {},
1017
+ "source": [
1018
+ "### Testing above"
1019
+ ]
1020
+ },
1021
+ {
1022
+ "cell_type": "code",
1023
+ "execution_count": 33,
1024
+ "metadata": {},
1025
+ "outputs": [],
1026
+ "source": [
1027
+ "\n",
1028
+ "#!pip install -qU huggingface_hub\n",
1029
+ "#!pip install -qU ipywidgets\n"
1030
+ ]
1031
+ },
1032
+ {
1033
+ "cell_type": "code",
1034
+ "execution_count": 49,
1035
+ "metadata": {},
1036
+ "outputs": [
1037
+ {
1038
+ "name": "stderr",
1039
+ "output_type": "stream",
1040
+ "text": [
1041
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/demo-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
1042
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
1043
+ ]
1044
+ }
1045
+ ],
1046
+ "source": [
1047
+ "from sentence_transformers import SentenceTransformer\n",
1048
+ "from langchain.vectorstores import Qdrant\n",
1049
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
1050
+ "\n",
1051
+ "\n",
1052
+ "# Load the SentenceTransformer model\n",
1053
+ "#model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
1054
+ "model_id = f\"{hf_username}/demo-compare-arctic-embed-m-ft\" \n",
1055
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
1056
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
1057
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
1058
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\n",
1059
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id, model_kwargs={\"trust_remote_code\": True})\n",
1060
+ "\n",
1061
+ "\n",
1062
+ "# Load documents into Qdrant\n",
1063
+ "qdrant_vectorstore = Qdrant.from_documents(\n",
1064
+ " documents_with_metadata,\n",
1065
+ " embedding_model,\n",
1066
+ " location=\":memory:\", # In-memory for testing\n",
1067
+ " collection_name=\"document_comparison\",\n",
1068
+ ")\n",
1069
+ "\n",
1070
+ "# Create a retriever\n",
1071
+ "qdrant_retriever = qdrant_vectorstore.as_retriever()"
1072
+ ]
1073
+ },
1074
+ {
1075
+ "cell_type": "code",
1076
+ "execution_count": 35,
1077
+ "metadata": {},
1078
+ "outputs": [],
1079
+ "source": [
1080
+ "# from langchain_core.prompts import ChatPromptTemplate\n",
1081
+ "\n",
1082
+ "# RAG_PROMPT = \"\"\"\n",
1083
+ "# CONTEXT:\n",
1084
+ "# {context}\n",
1085
+ "\n",
1086
+ "# QUERY:\n",
1087
+ "# {question}\n",
1088
+ "\n",
1089
+ "# You are a helpful assistant. Use the available context to answer the question. If you can't answer the question, say you don't know.\n",
1090
+ "# \"\"\"\n",
1091
+ "\n",
1092
+ "# rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)\n",
1093
+ "\n",
1094
+ "# from langchain_openai import ChatOpenAI\n",
1095
+ "\n",
1096
+ "# #openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
1097
+ "# openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
1098
+ "\n",
1099
+ "# from operator import itemgetter\n",
1100
+ "# from langchain.schema.output_parser import StrOutputParser\n",
1101
+ "\n",
1102
+ "# rag_chain = (\n",
1103
+ "# {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
1104
+ "# | rag_prompt | openai_chat_model | StrOutputParser()\n",
1105
+ "# )"
1106
+ ]
1107
+ },
1108
+ {
1109
+ "cell_type": "code",
1110
+ "execution_count": 50,
1111
+ "metadata": {},
1112
+ "outputs": [],
1113
+ "source": [
1114
+ "from langchain_core.prompts import ChatPromptTemplate\n",
1115
+ "RAG_PROMPT = \"\"\"\n",
1116
+ "CONTEXT:\n",
1117
+ "{context}\n",
1118
+ "\n",
1119
+ "QUERY:\n",
1120
+ "{question}\n",
1121
+ "\n",
1122
+ "You are a helpful assistant. Use the available context to answer the question.\n",
1123
+ "\n",
1124
+ "Return the response in **valid JSON format** with the following structure:\n",
1125
+ "\n",
1126
+ "[\n",
1127
+ " {{\n",
1128
+ " \"Derived Description\": \"A short name for the matched concept\",\n",
1129
+ " \"Protocol_1_Name\": \"Protocol 1 - Matching Element\",\n",
1130
+ " \"Protocol_2_Name\": \"Protocol 2 - Matching Element\"\n",
1131
+ " }},\n",
1132
+ " ...\n",
1133
+ "]\n",
1134
+ "\n",
1135
+ "### Rules:\n",
1136
+ "1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.\n",
1137
+ "2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.\n",
1138
+ "3. If no matching element is found in a protocol, leave it empty (\"\").\n",
1139
+ "4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.\n",
1140
+ "5. It should include all the elements in the two protocols.\n",
1141
+ "6. If it cannot match the element, create the row and include the protocol it did find and put \"could not match\" in the other protocol column.\n",
1142
+ "\"\"\"\n",
1143
+ "\n",
1144
+ "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)\n",
1145
+ "\n",
1146
+ "from langchain_openai import ChatOpenAI\n",
1147
+ "\n",
1148
+ "#openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
1149
+ "openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
1150
+ "\n",
1151
+ "from operator import itemgetter\n",
1152
+ "from langchain.schema.output_parser import StrOutputParser\n",
1153
+ "\n",
1154
+ "rag_chain = (\n",
1155
+ " {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
1156
+ " | rag_prompt | openai_chat_model | StrOutputParser()\n",
1157
+ ")"
1158
+ ]
1159
+ },
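+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustration only, with hypothetical element names: the shape of response the RAG prompt\n",
+ "# above asks for, and which parse_rag_output() further below expects to be valid JSON.\n",
+ "import json\n",
+ "\n",
+ "example_response = '[{\"Derived Description\": \"Pain coping\", \"Protocol_1_Name\": \"CPAI-10 - Item 3\", \"Protocol_2_Name\": \"could not match\"}]'\n",
+ "parsed_example = json.loads(example_response) # -> list of dicts, one per matched element"
+ ]
+ },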
1160
+ {
1161
+ "cell_type": "code",
1162
+ "execution_count": 51,
1163
+ "metadata": {},
1164
+ "outputs": [],
1165
+ "source": [
1166
+ "question_text = \"\"\"Between these two files containing protocols, can you find the data elements in each that most likely match the element in the other and output a CSV that lists three columns:\n",
1167
+ "\n",
1168
+ "The questions within elements will be similar between the two documents and can be used to match the elements.\n",
1169
+ "\n",
1170
+ "1. Derived description from the two documents describing the index/measure/scale.\n",
1171
+ "2. A column for each standard.\n",
1172
+ "3. In the column for each name/version, the data element used to capture that description.\n",
1173
+ "\n",
1174
+ "There should only be one row for each scale/index/etc.\n",
1175
+ "The description should not be one of the questions but a name that best describes the similar data elements.\"\"\"\n",
1176
+ "\n",
1177
+ "response_text = rag_chain.invoke({\"question\": question_text})\n",
1178
+ "# response = rag_chain.invoke({\"question\": question_text})"
1179
+ ]
1180
+ },
1181
+ {
1182
+ "cell_type": "code",
1183
+ "execution_count": 52,
1184
+ "metadata": {},
1185
+ "outputs": [
1186
+ {
1187
+ "name": "stdout",
1188
+ "output_type": "stream",
1189
+ "text": [
1190
+ "✅ CSV file saved: matching_data_elements.csv\n"
1191
+ ]
1192
+ }
1193
+ ],
1194
+ "source": [
1195
+ "import os\n",
+ "import json\n",
1196
+ "import pandas as pd\n",
1197
+ "\n",
1198
+ "def parse_rag_output(response_text):\n",
1199
+ " \"\"\"Extract structured JSON data from the RAG response.\"\"\"\n",
1200
+ " try:\n",
1201
+ " structured_data = json.loads(response_text)\n",
1202
+ "\n",
1203
+ " # Ensure similarity score is always included\n",
1204
+ " for item in structured_data:\n",
1205
+ " item.setdefault(\"Similarity Score\", \"N/A\") # Default if missing\n",
1206
+ "\n",
1207
+ " return structured_data\n",
1208
+ " except json.JSONDecodeError:\n",
1209
+ " print(\"Error: Response is not valid JSON.\")\n",
1210
+ " return None\n",
1211
+ "\n",
1212
+ "def save_to_csv(data, directory=\"./output\", filename=\"matching_data_elements.csv\"):\n",
1213
+ " \"\"\"Save structured data to CSV.\"\"\"\n",
1214
+ " if not data:\n",
1215
+ " print(\"No data to save.\")\n",
1216
+ " return\n",
1217
+ "\n",
1218
+ " file_path = os.path.join(directory, filename)\n",
1219
+ " df = pd.DataFrame(data, columns=[\"Derived Description\", \"Protocol_1_Name\", \"Protocol_2_Name\"]) # Ensure correct columns\n",
1220
+ " df.to_csv(file_path, index=False)\n",
1221
+ " print(f\"✅ CSV file saved: {filename}\")\n",
1222
+ "\n",
1223
+ "# Run the pipeline\n",
1224
+ "structured_output = parse_rag_output(response_text)\n",
1225
+ "save_to_csv(structured_output)\n"
1226
+ ]
1227
+ },
1228
+ {
1229
+ "cell_type": "code",
1230
+ "execution_count": null,
1231
+ "metadata": {},
1232
+ "outputs": [],
1233
+ "source": []
1234
+ },
1235
+ {
1236
+ "cell_type": "code",
1237
+ "execution_count": 40,
1238
+ "metadata": {},
1239
+ "outputs": [],
1240
+ "source": [
1241
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2\"})"
1242
+ ]
1243
+ },
1244
+ {
1245
+ "cell_type": "code",
1246
+ "execution_count": 41,
1247
+ "metadata": {},
1248
+ "outputs": [],
1249
+ "source": [
1250
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2. these are example headings not the ones in the actual documents. just list the matches not the rational. Can you list multiple matches?\"})"
1251
+ ]
1252
+ },
1253
+ {
1254
+ "cell_type": "code",
1255
+ "execution_count": null,
1256
+ "metadata": {},
1257
+ "outputs": [],
1258
+ "source": []
1259
+ }
1260
+ ],
1261
+ "metadata": {
1262
+ "kernelspec": {
1263
+ "display_name": ".venv",
1264
+ "language": "python",
1265
+ "name": "python3"
1266
+ },
1267
+ "language_info": {
1268
+ "codemirror_mode": {
1269
+ "name": "ipython",
1270
+ "version": 3
1271
+ },
1272
+ "file_extension": ".py",
1273
+ "mimetype": "text/x-python",
1274
+ "name": "python",
1275
+ "nbconvert_exporter": "python",
1276
+ "pygments_lexer": "ipython3",
1277
+ "version": "3.13.1"
1278
+ }
1279
+ },
1280
+ "nbformat": 4,
1281
+ "nbformat_minor": 2
1282
+ }
03-testembedtune.ipynb ADDED
@@ -0,0 +1,1861 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 19,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# !pip install nest_asyncio \\\n",
10
+ "# langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters \\\n",
11
+ "# python-pptx==1.0.2 nltk==3.9.1 pymupdf lxml \\\n",
12
+ "# sentence-transformers IProgress \\\n",
13
+ "# huggingface_hub ipywidgets \\\n",
14
+ "# qdrant-client langchain_experimental\n",
15
+ "\n",
16
+ "# !pip install sentence_transformers datasets pyarrow\n",
17
+ "# !pip install torch\n",
18
+ "# !pip install accelerate>=0.26.0\n",
19
+ "# !pip install transformers\n",
20
+ "# !pip install wandb\n",
21
+ "# !pip install ragas\n",
22
+ "\n"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 1,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "\n",
32
+ "import nest_asyncio\n",
33
+ "\n",
34
+ "nest_asyncio.apply()"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 3,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "import os\n",
44
+ "import getpass\n",
45
+ "\n",
46
+ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"Enter Your OpenAI API Key: \")\n",
47
+ "os.environ[\"RAGAS_APP_TOKEN\"] = getpass.getpass(\"Please enter your Ragas API key!\")"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 4,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "hf_username = getpass.getpass(\"Enter Your Hugging Face Username: \")\n"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 5,
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "data": {
66
+ "application/vnd.jupyter.widget-view+json": {
67
+ "model_id": "2098545c1f924b7c85f8b7ca809f6f1a",
68
+ "version_major": 2,
69
+ "version_minor": 0
70
+ },
71
+ "text/plain": [
72
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
73
+ ]
74
+ },
75
+ "metadata": {},
76
+ "output_type": "display_data"
77
+ },
78
+ {
79
+ "name": "stderr",
80
+ "output_type": "stream",
81
+ "text": [
82
+ "Token has not been saved to git credential helper.\n"
83
+ ]
84
+ }
85
+ ],
86
+ "source": [
87
+ "from huggingface_hub import notebook_login\n",
88
+ "notebook_login()"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 6,
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "name": "stdout",
98
+ "output_type": "stream",
99
+ "text": [
100
+ "{'type': 'user', 'id': '67624d1b57e77fe6e0c87ae5', 'name': 'drewgenai', 'fullname': 'Drew DeMarco', 'email': '[email protected]', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/L6eLaZmCK4jqW3ZTLYIAR.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'newotken', 'role': 'write', 'createdAt': '2025-02-12T04:11:04.130Z'}}}\n"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "from huggingface_hub import whoami\n",
106
+ "print(whoami())\n"
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "execution_count": 7,
112
+ "metadata": {},
113
+ "outputs": [
114
+ {
115
+ "name": "stdout",
116
+ "output_type": "stream",
117
+ "text": [
118
+ "mkdir: cannot create directory ‘example_files’: File exists\n",
119
+ "mkdir: cannot create directory ‘output’: File exists\n"
120
+ ]
121
+ }
122
+ ],
123
+ "source": [
124
+ "!mkdir example_files\n",
125
+ "!mkdir output"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 8,
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "from langchain_community.document_loaders import DirectoryLoader\n",
135
+ "from langchain_community.document_loaders import PyMuPDFLoader\n",
136
+ "\n",
137
+ "path = \"example_files/\"\n",
138
+ "text_loader = DirectoryLoader(path, glob=\"*.pdf\", loader_cls=PyMuPDFLoader)"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "markdown",
143
+ "metadata": {},
144
+ "source": [
145
+ "1️⃣ Header-Based Chunking (Title-Based Splitter)\n",
146
+ "Uses document structure to split on headings, section titles, or patterns.\n",
147
+ "Works well for structured documents with named assessments, numbered lists, or headers.\n",
148
+ "Example: If it detects Chronic Pain Adjustment Index (CPAI-10), it groups everything under that title (a minimal header-based sketch follows after this cell).\n",
149
+ "2️⃣ Semantic Chunking (Text-Meaning Splitter)\n",
150
+ "Uses embeddings or sentence similarity to decide where to break chunks.\n",
151
+ "Prevents splitting mid-context if sentences are closely related.\n",
152
+ "Example: Groups all related pain-assessment questions into one chunk."
153
+ ]
154
+ },
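+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal sketch of the header-based chunking idea described above (illustration only,\n",
+ "# not used by the rest of this notebook). It assumes the text carries markdown-style\n",
+ "# headings, which these PDF extracts may not; the header levels and key names are assumptions.\n",
+ "from langchain_text_splitters import MarkdownHeaderTextSplitter\n",
+ "\n",
+ "headers_to_split_on = [(\"#\", \"title\"), (\"##\", \"section\")]\n",
+ "header_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)\n",
+ "\n",
+ "# Example usage once all_documents has been loaded further below:\n",
+ "# header_chunks = header_splitter.split_text(all_documents[0].page_content)"
+ ]
+ },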
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": null,
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": []
161
+ },
162
+ {
163
+ "cell_type": "markdown",
164
+ "metadata": {},
165
+ "source": [
166
+ "### Testing below\n"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 78,
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "\n",
176
+ "\n",
177
+ "from langchain_experimental.text_splitter import SemanticChunker\n",
178
+ "\n",
179
+ "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
180
+ "\n",
181
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
182
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
183
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
184
+ "\n",
185
+ "semantic_splitter = SemanticChunker(embedding_model)\n",
186
+ "\n",
187
+ "all_documents = text_loader.load()\n",
188
+ "documents_with_metadata = []\n",
189
+ "\n"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 10,
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "from langchain.schema import Document\n",
199
+ "\n",
200
+ "for doc in all_documents:\n",
201
+ " source_name = doc.metadata.get(\"source\", \"unknown\") # Get document source\n",
202
+ "\n",
203
+ " # Use SemanticChunker to intelligently split text\n",
204
+ " chunks = semantic_splitter.split_text(doc.page_content)\n",
205
+ "\n",
206
+ " # Convert chunks into LangChain Document format with metadata\n",
207
+ " for chunk in chunks:\n",
208
+ " doc_chunk = Document(page_content=chunk, metadata={\"source\": source_name})\n",
209
+ " documents_with_metadata.append(doc_chunk)"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": null,
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": []
218
+ },
219
+ {
220
+ "cell_type": "markdown",
221
+ "metadata": {},
222
+ "source": [
223
+ "### New testing below"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": 75,
229
+ "metadata": {},
230
+ "outputs": [],
231
+ "source": [
232
+ "#training_documents = text_loader.load()\n",
233
+ "### keeping documents_with_metadata and training_documents separate for now\n",
234
+ "\n",
235
+ "\n",
236
+ "from langchain.schema import Document\n",
237
+ "\n",
238
+ "training_documents = []\n",
239
+ "\n",
240
+ "\n",
241
+ "for doc in all_documents:\n",
242
+ " source_name = doc.metadata.get(\"source\", \"unknown\") # Get document source\n",
243
+ "\n",
244
+ " # Use SemanticChunker to intelligently split text\n",
245
+ " chunks = semantic_splitter.split_text(doc.page_content)\n",
246
+ "\n",
247
+ " # Convert chunks into LangChain Document format with metadata\n",
248
+ " for chunk in chunks:\n",
249
+ " doc_chunk = Document(page_content=chunk, metadata={\"source\": source_name})\n",
250
+ " training_documents.append(doc_chunk)\n",
251
+ "\n",
252
+ "\n",
253
+ "\n"
254
+ ]
255
+ },
256
+ {
257
+ "cell_type": "code",
258
+ "execution_count": 76,
259
+ "metadata": {},
260
+ "outputs": [],
261
+ "source": [
262
+ "import uuid\n",
263
+ "\n",
264
+ "id_set = set()\n",
265
+ "\n",
266
+ "for document in training_documents:\n",
267
+ " id = str(uuid.uuid4())\n",
268
+ " while id in id_set:\n",
269
+ " id = str(uuid.uuid4())\n",
270
+ " id_set.add(id)\n",
271
+ " document.metadata[\"id\"] = id"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 77,
277
+ "metadata": {},
278
+ "outputs": [
279
+ {
280
+ "name": "stdout",
281
+ "output_type": "stream",
282
+ "text": [
283
+ "Training set: 9 docs\n",
284
+ "Validation set: 2 docs\n",
285
+ "Test set: 3 docs\n"
286
+ ]
287
+ }
288
+ ],
289
+ "source": [
290
+ "# Define split percentages\n",
291
+ "train_ratio = 0.7 # 70% training\n",
292
+ "val_ratio = 0.2 # 20% validation\n",
293
+ "test_ratio = 0.1 # 10% test\n",
294
+ "\n",
295
+ "# Calculate index breakpoints\n",
296
+ "total_docs = len(training_documents)\n",
297
+ "train_size = int(total_docs * train_ratio)\n",
298
+ "val_size = int(total_docs * val_ratio)\n",
299
+ "\n",
300
+ "# Perform the splits\n",
301
+ "training_split_documents = training_documents[:train_size]\n",
302
+ "val_split_documents = training_documents[train_size:train_size + val_size]\n",
303
+ "test_split_documents = training_documents[train_size + val_size:]\n",
304
+ "\n",
305
+ "# Print sizes to verify\n",
306
+ "print(f\"Training set: {len(training_split_documents)} docs\")\n",
307
+ "print(f\"Validation set: {len(val_split_documents)} docs\")\n",
308
+ "print(f\"Test set: {len(test_split_documents)} docs\")\n",
309
+ "\n",
310
+ "\n"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": 44,
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "from langchain_openai import ChatOpenAI\n",
320
+ "\n",
321
+ "qa_chat_model = ChatOpenAI(\n",
322
+ " model=\"gpt-4o-mini\",\n",
323
+ " temperature=0\n",
324
+ ")"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 45,
330
+ "metadata": {},
331
+ "outputs": [],
332
+ "source": [
333
+ "from langchain_core.prompts import ChatPromptTemplate\n",
334
+ "\n",
335
+ "qa_prompt = \"\"\"\\\n",
336
+ "Given the following context, you must generate questions based on only the provided context.\n",
337
+ "\n",
338
+ "You are to generate {n_questions} questions which should be provided in the following format:\n",
339
+ "\n",
340
+ "1. QUESTION #1\n",
341
+ "2. QUESTION #2\n",
342
+ "...\n",
343
+ "\n",
344
+ "Context:\n",
345
+ "{context}\n",
346
+ "\"\"\"\n",
347
+ "\n",
348
+ "qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": 46,
354
+ "metadata": {},
355
+ "outputs": [],
356
+ "source": [
357
+ "question_generation_chain = qa_prompt_template | qa_chat_model"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": 47,
363
+ "metadata": {},
364
+ "outputs": [],
365
+ "source": [
366
+ "import asyncio\n",
367
+ "import uuid\n",
368
+ "from tqdm import tqdm\n",
369
+ "\n",
370
+ "async def process_document(document, n_questions):\n",
371
+ " questions_generated = await question_generation_chain.ainvoke({\"context\": document.page_content, \"n_questions\": n_questions})\n",
372
+ "\n",
373
+ " doc_questions = {}\n",
374
+ " doc_relevant_docs = {}\n",
375
+ "\n",
376
+ " for question in questions_generated.content.split(\"\\n\"):\n",
377
+ " question_id = str(uuid.uuid4())\n",
378
+ " doc_questions[question_id] = \"\".join(question.split(\".\")[1:]).strip()\n",
379
+ " doc_relevant_docs[question_id] = [document.metadata[\"id\"]]\n",
380
+ "\n",
381
+ " return doc_questions, doc_relevant_docs\n",
382
+ "\n",
383
+ "async def create_questions(documents, n_questions):\n",
384
+ " tasks = [process_document(doc, n_questions) for doc in documents]\n",
385
+ "\n",
386
+ " questions = {}\n",
387
+ " relevant_docs = {}\n",
388
+ "\n",
389
+ " for task in tqdm(asyncio.as_completed(tasks), total=len(documents), desc=\"Processing documents\"):\n",
390
+ " doc_questions, doc_relevant_docs = await task\n",
391
+ " questions.update(doc_questions)\n",
392
+ " relevant_docs.update(doc_relevant_docs)\n",
393
+ "\n",
394
+ " return questions, relevant_docs"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": 48,
400
+ "metadata": {},
401
+ "outputs": [
402
+ {
403
+ "name": "stderr",
404
+ "output_type": "stream",
405
+ "text": [
406
+ "Processing documents: 100%|██████████| 9/9 [00:02<00:00, 4.44it/s]\n",
407
+ "Processing documents: 100%|██████████| 2/2 [00:01<00:00, 1.74it/s]\n",
408
+ "Processing documents: 100%|██████████| 3/3 [00:02<00:00, 1.50it/s]\n"
409
+ ]
410
+ }
411
+ ],
412
+ "source": [
413
+ "training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2)\n",
414
+ "val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2)\n",
415
+ "test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2)"
416
+ ]
417
+ },
418
+ {
419
+ "cell_type": "code",
420
+ "execution_count": 49,
421
+ "metadata": {},
422
+ "outputs": [],
423
+ "source": [
424
+ "import json\n",
425
+ "\n",
426
+ "training_corpus = {train_item.metadata[\"id\"] : train_item.page_content for train_item in training_split_documents}\n",
427
+ "\n",
428
+ "train_dataset = {\n",
429
+ " \"questions\" : training_questions,\n",
430
+ " \"relevant_contexts\" : training_relevant_contexts,\n",
431
+ " \"corpus\" : training_corpus\n",
432
+ "}\n",
433
+ "\n",
434
+ "with open(\"training_dataset.jsonl\", \"w\") as f:\n",
435
+ " json.dump(train_dataset, f)\n",
436
+ "\n",
437
+ "\n",
438
+ "val_corpus = {val_item.metadata[\"id\"] : val_item.page_content for val_item in val_split_documents}\n",
439
+ "\n",
440
+ "val_dataset = {\n",
441
+ " \"questions\" : val_questions,\n",
442
+ " \"relevant_contexts\" : val_relevant_contexts,\n",
443
+ " \"corpus\" : val_corpus\n",
444
+ "}\n",
445
+ "\n",
446
+ "with open(\"val_dataset.jsonl\", \"w\") as f:\n",
447
+ " json.dump(val_dataset, f)\n",
448
+ "\n",
449
+ "\n",
450
+ "test_corpus = {test_item.metadata[\"id\"] : test_item.page_content for test_item in test_split_documents}\n",
451
+ "\n",
452
+ "test_dataset = {\n",
453
+ " \"questions\" : test_questions,\n",
454
+ " \"relevant_contexts\" : test_relevant_contexts,\n",
455
+ " \"corpus\" : test_corpus\n",
456
+ "}\n",
457
+ "\n",
458
+ "with open(\"test_dataset.jsonl\", \"w\") as f:\n",
459
+ " json.dump(test_dataset, f)"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 50,
465
+ "metadata": {},
466
+ "outputs": [],
467
+ "source": [
468
+ "# !pip install -qU sentence_transformers datasets pyarrow"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": 51,
474
+ "metadata": {},
475
+ "outputs": [],
476
+ "source": [
477
+ "from sentence_transformers import SentenceTransformer\n",
478
+ "\n",
479
+ "model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
480
+ "model = SentenceTransformer(model_id)"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": 52,
486
+ "metadata": {},
487
+ "outputs": [],
488
+ "source": [
489
+ "from torch.utils.data import DataLoader\n",
490
+ "from torch.utils.data import Dataset\n",
491
+ "from sentence_transformers import InputExample"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "code",
496
+ "execution_count": 53,
497
+ "metadata": {},
498
+ "outputs": [],
499
+ "source": [
500
+ "BATCH_SIZE = 10"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "code",
505
+ "execution_count": 54,
506
+ "metadata": {},
507
+ "outputs": [],
508
+ "source": [
509
+ "corpus = train_dataset['corpus']\n",
510
+ "queries = train_dataset['questions']\n",
511
+ "relevant_docs = train_dataset['relevant_contexts']\n",
512
+ "\n",
513
+ "examples = []\n",
514
+ "for query_id, query in queries.items():\n",
515
+ " doc_id = relevant_docs[query_id][0]\n",
516
+ " text = corpus[doc_id]\n",
517
+ " example = InputExample(texts=[query, text])\n",
518
+ " examples.append(example)"
519
+ ]
520
+ },
521
+ {
522
+ "cell_type": "code",
523
+ "execution_count": 55,
524
+ "metadata": {},
525
+ "outputs": [],
526
+ "source": [
527
+ "loader = DataLoader(\n",
528
+ " examples, batch_size=BATCH_SIZE\n",
529
+ ")"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 56,
535
+ "metadata": {},
536
+ "outputs": [],
537
+ "source": [
538
+ "from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss\n",
539
+ "\n",
540
+ "matryoshka_dimensions = [768, 512, 256, 128, 64]\n",
541
+ "inner_train_loss = MultipleNegativesRankingLoss(model)\n",
542
+ "train_loss = MatryoshkaLoss(\n",
543
+ " model, inner_train_loss, matryoshka_dims=matryoshka_dimensions\n",
544
+ ")"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": 57,
550
+ "metadata": {},
551
+ "outputs": [],
552
+ "source": [
553
+ "from sentence_transformers.evaluation import InformationRetrievalEvaluator\n",
554
+ "\n",
555
+ "corpus = val_dataset['corpus']\n",
556
+ "queries = val_dataset['questions']\n",
557
+ "relevant_docs = val_dataset['relevant_contexts']\n",
558
+ "\n",
559
+ "evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)"
560
+ ]
561
+ },
562
+ {
563
+ "cell_type": "code",
564
+ "execution_count": 58,
565
+ "metadata": {},
566
+ "outputs": [],
567
+ "source": [
568
+ "EPOCHS = 5"
569
+ ]
570
+ },
571
+ {
572
+ "cell_type": "code",
573
+ "execution_count": 59,
574
+ "metadata": {},
575
+ "outputs": [
576
+ {
577
+ "data": {
578
+ "text/html": [
579
+ "<button onClick=\"this.nextSibling.style.display='block';this.style.display='none';\">Display W&B run</button><iframe src='https://wandb.ai/dummy/dummy/runs/3hjt799n?jupyter=true' style='border:none;width:100%;height:420px;display:none;'></iframe>"
580
+ ],
581
+ "text/plain": [
582
+ "<wandb.sdk.wandb_run.Run at 0x749b55325d10>"
583
+ ]
584
+ },
585
+ "execution_count": 59,
586
+ "metadata": {},
587
+ "output_type": "execute_result"
588
+ }
589
+ ],
590
+ "source": [
591
+ "#!pip install wandb\n",
592
+ "\n",
593
+ "import wandb\n",
594
+ "wandb.init(mode=\"disabled\")"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": 69,
600
+ "metadata": {},
601
+ "outputs": [
602
+ {
603
+ "data": {
604
+ "application/vnd.jupyter.widget-view+json": {
605
+ "model_id": "400bc1e49a854008a875534a9d3a50d4",
606
+ "version_major": 2,
607
+ "version_minor": 0
608
+ },
609
+ "text/plain": [
610
+ "Computing widget examples: 0%| | 0/1 [00:00<?, ?example/s]"
611
+ ]
612
+ },
613
+ "metadata": {},
614
+ "output_type": "display_data"
615
+ },
616
+ {
617
+ "name": "stderr",
618
+ "output_type": "stream",
619
+ "text": [
620
+ "\u001b[34m\u001b[1mwandb\u001b[0m: \u001b[33mWARNING\u001b[0m The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.\n"
621
+ ]
622
+ },
623
+ {
624
+ "data": {
625
+ "text/html": [
626
+ "\n",
627
+ " <div>\n",
628
+ " \n",
629
+ " <progress value='10' max='10' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
630
+ " [10/10 00:02, Epoch 5/5]\n",
631
+ " </div>\n",
632
+ " <table border=\"1\" class=\"dataframe\">\n",
633
+ " <thead>\n",
634
+ " <tr style=\"text-align: left;\">\n",
635
+ " <th>Step</th>\n",
636
+ " <th>Training Loss</th>\n",
637
+ " <th>Validation Loss</th>\n",
638
+ " <th>Cosine Accuracy@1</th>\n",
639
+ " <th>Cosine Accuracy@3</th>\n",
640
+ " <th>Cosine Accuracy@5</th>\n",
641
+ " <th>Cosine Accuracy@10</th>\n",
642
+ " <th>Cosine Precision@1</th>\n",
643
+ " <th>Cosine Precision@3</th>\n",
644
+ " <th>Cosine Precision@5</th>\n",
645
+ " <th>Cosine Precision@10</th>\n",
646
+ " <th>Cosine Recall@1</th>\n",
647
+ " <th>Cosine Recall@3</th>\n",
648
+ " <th>Cosine Recall@5</th>\n",
649
+ " <th>Cosine Recall@10</th>\n",
650
+ " <th>Cosine Ndcg@10</th>\n",
651
+ " <th>Cosine Mrr@10</th>\n",
652
+ " <th>Cosine Map@100</th>\n",
653
+ " </tr>\n",
654
+ " </thead>\n",
655
+ " <tbody>\n",
656
+ " <tr>\n",
657
+ " <td>2</td>\n",
658
+ " <td>No log</td>\n",
659
+ " <td>No log</td>\n",
660
+ " <td>0.750000</td>\n",
661
+ " <td>1.000000</td>\n",
662
+ " <td>1.000000</td>\n",
663
+ " <td>1.000000</td>\n",
664
+ " <td>0.750000</td>\n",
665
+ " <td>0.333333</td>\n",
666
+ " <td>0.200000</td>\n",
667
+ " <td>0.100000</td>\n",
668
+ " <td>0.750000</td>\n",
669
+ " <td>1.000000</td>\n",
670
+ " <td>1.000000</td>\n",
671
+ " <td>1.000000</td>\n",
672
+ " <td>0.907732</td>\n",
673
+ " <td>0.875000</td>\n",
674
+ " <td>0.875000</td>\n",
675
+ " </tr>\n",
676
+ " <tr>\n",
677
+ " <td>4</td>\n",
678
+ " <td>No log</td>\n",
679
+ " <td>No log</td>\n",
680
+ " <td>0.750000</td>\n",
681
+ " <td>1.000000</td>\n",
682
+ " <td>1.000000</td>\n",
683
+ " <td>1.000000</td>\n",
684
+ " <td>0.750000</td>\n",
685
+ " <td>0.333333</td>\n",
686
+ " <td>0.200000</td>\n",
687
+ " <td>0.100000</td>\n",
688
+ " <td>0.750000</td>\n",
689
+ " <td>1.000000</td>\n",
690
+ " <td>1.000000</td>\n",
691
+ " <td>1.000000</td>\n",
692
+ " <td>0.907732</td>\n",
693
+ " <td>0.875000</td>\n",
694
+ " <td>0.875000</td>\n",
695
+ " </tr>\n",
696
+ " <tr>\n",
697
+ " <td>6</td>\n",
698
+ " <td>No log</td>\n",
699
+ " <td>No log</td>\n",
700
+ " <td>0.750000</td>\n",
701
+ " <td>1.000000</td>\n",
702
+ " <td>1.000000</td>\n",
703
+ " <td>1.000000</td>\n",
704
+ " <td>0.750000</td>\n",
705
+ " <td>0.333333</td>\n",
706
+ " <td>0.200000</td>\n",
707
+ " <td>0.100000</td>\n",
708
+ " <td>0.750000</td>\n",
709
+ " <td>1.000000</td>\n",
710
+ " <td>1.000000</td>\n",
711
+ " <td>1.000000</td>\n",
712
+ " <td>0.907732</td>\n",
713
+ " <td>0.875000</td>\n",
714
+ " <td>0.875000</td>\n",
715
+ " </tr>\n",
716
+ " <tr>\n",
717
+ " <td>8</td>\n",
718
+ " <td>No log</td>\n",
719
+ " <td>No log</td>\n",
720
+ " <td>0.750000</td>\n",
721
+ " <td>1.000000</td>\n",
722
+ " <td>1.000000</td>\n",
723
+ " <td>1.000000</td>\n",
724
+ " <td>0.750000</td>\n",
725
+ " <td>0.333333</td>\n",
726
+ " <td>0.200000</td>\n",
727
+ " <td>0.100000</td>\n",
728
+ " <td>0.750000</td>\n",
729
+ " <td>1.000000</td>\n",
730
+ " <td>1.000000</td>\n",
731
+ " <td>1.000000</td>\n",
732
+ " <td>0.907732</td>\n",
733
+ " <td>0.875000</td>\n",
734
+ " <td>0.875000</td>\n",
735
+ " </tr>\n",
736
+ " <tr>\n",
737
+ " <td>10</td>\n",
738
+ " <td>No log</td>\n",
739
+ " <td>No log</td>\n",
740
+ " <td>0.750000</td>\n",
741
+ " <td>1.000000</td>\n",
742
+ " <td>1.000000</td>\n",
743
+ " <td>1.000000</td>\n",
744
+ " <td>0.750000</td>\n",
745
+ " <td>0.333333</td>\n",
746
+ " <td>0.200000</td>\n",
747
+ " <td>0.100000</td>\n",
748
+ " <td>0.750000</td>\n",
749
+ " <td>1.000000</td>\n",
750
+ " <td>1.000000</td>\n",
751
+ " <td>1.000000</td>\n",
752
+ " <td>0.907732</td>\n",
753
+ " <td>0.875000</td>\n",
754
+ " <td>0.875000</td>\n",
755
+ " </tr>\n",
756
+ " </tbody>\n",
757
+ "</table><p>"
758
+ ],
759
+ "text/plain": [
760
+ "<IPython.core.display.HTML object>"
761
+ ]
762
+ },
763
+ "metadata": {},
764
+ "output_type": "display_data"
765
+ }
766
+ ],
767
+ "source": [
768
+ "#commented out for now as want to run whole notebook but not retrain\n",
769
+ "# warmup_steps = int(len(loader) * EPOCHS * 0.1)\n",
770
+ "\n",
771
+ "# model.fit(\n",
772
+ "# train_objectives=[(loader, train_loss)],\n",
773
+ "# epochs=EPOCHS,\n",
774
+ "# warmup_steps=warmup_steps,\n",
775
+ "# output_path='models/midterm-compare-arctic-embed-m-ft',\n",
776
+ "# show_progress_bar=True,\n",
777
+ "# evaluator=evaluator,\n",
778
+ "# evaluation_steps=50\n",
779
+ "# )"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": 61,
785
+ "metadata": {},
786
+ "outputs": [],
787
+ "source": [
788
+ "#commented out for now as want to run whole notebook but not sending to hub\n",
789
+ "#model.push_to_hub(f\"{hf_username}/midterm-compare-arctic-embed-m-ft\")"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "code",
794
+ "execution_count": 62,
795
+ "metadata": {},
796
+ "outputs": [
797
+ {
798
+ "name": "stderr",
799
+ "output_type": "stream",
800
+ "text": [
801
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
802
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
803
+ ]
804
+ }
805
+ ],
806
+ "source": [
807
+ "finetune_embeddings = HuggingFaceEmbeddings(model_name=f\"{hf_username}/midterm-compare-arctic-embed-m-ft\")"
808
+ ]
809
+ },
810
+ {
811
+ "cell_type": "markdown",
812
+ "metadata": {},
813
+ "source": [
814
+ "###testingabove"
815
+ ]
816
+ },
817
+ {
818
+ "cell_type": "code",
819
+ "execution_count": 93,
820
+ "metadata": {},
821
+ "outputs": [
822
+ {
823
+ "name": "stderr",
824
+ "output_type": "stream",
825
+ "text": [
826
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
827
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
828
+ ]
829
+ },
830
+ {
831
+ "ename": "IndexError",
832
+ "evalue": "list index out of range",
833
+ "output_type": "error",
834
+ "traceback": [
835
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
836
+ "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
837
+ "Cell \u001b[0;32mIn[93], line 17\u001b[0m\n\u001b[1;32m 9\u001b[0m embedding_model \u001b[38;5;241m=\u001b[39m HuggingFaceEmbeddings(model_name\u001b[38;5;241m=\u001b[39mmodel_id)\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# model_id = \"Snowflake/snowflake-arctic-embed-m\"\u001b[39;00m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# embedding_model = HuggingFaceEmbeddings(model_name=model_id)\u001b[39;00m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m \n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# Load documents into Qdrant\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m qdrant_vectorstore \u001b[38;5;241m=\u001b[39m \u001b[43mQdrant\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_documents\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 18\u001b[0m \u001b[43m \u001b[49m\u001b[43mdocuments_with_metadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding_model\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m:memory:\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# In-memory for testing\u001b[39;49;00m\n\u001b[1;32m 21\u001b[0m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdocument_comparison\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 22\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Create a retriever\u001b[39;00m\n\u001b[1;32m 25\u001b[0m qdrant_retriever \u001b[38;5;241m=\u001b[39m qdrant_vectorstore\u001b[38;5;241m.\u001b[39mas_retriever()\n",
838
+ "File \u001b[0;32m~/Documents/huggingfacetesting/temptest/.venv/lib/python3.13/site-packages/langchain_core/vectorstores/base.py:852\u001b[0m, in \u001b[0;36mVectorStore.from_documents\u001b[0;34m(cls, documents, embedding, **kwargs)\u001b[0m\n\u001b[1;32m 849\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(ids):\n\u001b[1;32m 850\u001b[0m kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mids\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m ids\n\u001b[0;32m--> 852\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_texts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtexts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43membedding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadatas\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadatas\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
839
+ "File \u001b[0;32m~/Documents/huggingfacetesting/temptest/.venv/lib/python3.13/site-packages/langchain_community/vectorstores/qdrant.py:1337\u001b[0m, in \u001b[0;36mQdrant.from_texts\u001b[0;34m(cls, texts, embedding, metadatas, ids, location, url, port, grpc_port, prefer_grpc, https, api_key, prefix, timeout, host, path, collection_name, distance_func, content_payload_key, metadata_payload_key, vector_name, batch_size, shard_number, replication_factor, write_consistency_factor, on_disk_payload, hnsw_config, optimizers_config, wal_config, quantization_config, init_from, on_disk, force_recreate, **kwargs)\u001b[0m\n\u001b[1;32m 1197\u001b[0m \u001b[38;5;129m@classmethod\u001b[39m\n\u001b[1;32m 1198\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfrom_texts\u001b[39m(\n\u001b[1;32m 1199\u001b[0m \u001b[38;5;28mcls\u001b[39m: Type[Qdrant],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1232\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 1233\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Qdrant:\n\u001b[1;32m 1234\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Construct Qdrant wrapper from a list of texts.\u001b[39;00m\n\u001b[1;32m 1235\u001b[0m \n\u001b[1;32m 1236\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1335\u001b[0m \u001b[38;5;124;03m qdrant = Qdrant.from_texts(texts, embeddings, \"localhost\")\u001b[39;00m\n\u001b[1;32m 1336\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1337\u001b[0m qdrant \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconstruct_instance\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1338\u001b[0m \u001b[43m \u001b[49m\u001b[43mtexts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1339\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1340\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1341\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1342\u001b[0m \u001b[43m \u001b[49m\u001b[43mport\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1343\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrpc_port\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1344\u001b[0m \u001b[43m \u001b[49m\u001b[43mprefer_grpc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1345\u001b[0m \u001b[43m \u001b[49m\u001b[43mhttps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1346\u001b[0m \u001b[43m \u001b[49m\u001b[43mapi_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1347\u001b[0m \u001b[43m \u001b[49m\u001b[43mprefix\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1348\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1349\u001b[0m \u001b[43m \u001b[49m\u001b[43mhost\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1350\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1351\u001b[0m \u001b[43m \u001b[49m\u001b[43mcollection_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1352\u001b[0m \u001b[43m \u001b[49m\u001b[43mdistance_func\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1353\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_payload_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1354\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata_payload_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1355\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mvector_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1356\u001b[0m \u001b[43m \u001b[49m\u001b[43mshard_number\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1357\u001b[0m \u001b[43m \u001b[49m\u001b[43mreplication_factor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1358\u001b[0m \u001b[43m \u001b[49m\u001b[43mwrite_consistency_factor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1359\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_disk_payload\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1360\u001b[0m \u001b[43m \u001b[49m\u001b[43mhnsw_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1361\u001b[0m \u001b[43m \u001b[49m\u001b[43moptimizers_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1362\u001b[0m \u001b[43m \u001b[49m\u001b[43mwal_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1363\u001b[0m \u001b[43m \u001b[49m\u001b[43mquantization_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1364\u001b[0m \u001b[43m \u001b[49m\u001b[43minit_from\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1365\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_disk\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1366\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_recreate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1367\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1368\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1369\u001b[0m qdrant\u001b[38;5;241m.\u001b[39madd_texts(texts, metadatas, ids, batch_size)\n\u001b[1;32m 1370\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m qdrant\n",
840
+ "File \u001b[0;32m~/Documents/huggingfacetesting/temptest/.venv/lib/python3.13/site-packages/langchain_community/vectorstores/qdrant.py:1640\u001b[0m, in \u001b[0;36mQdrant.construct_instance\u001b[0;34m(cls, texts, embedding, location, url, port, grpc_port, prefer_grpc, https, api_key, prefix, timeout, host, path, collection_name, distance_func, content_payload_key, metadata_payload_key, vector_name, shard_number, replication_factor, write_consistency_factor, on_disk_payload, hnsw_config, optimizers_config, wal_config, quantization_config, init_from, on_disk, force_recreate, **kwargs)\u001b[0m\n\u001b[1;32m 1638\u001b[0m \u001b[38;5;66;03m# Just do a single quick embedding to get vector size\u001b[39;00m\n\u001b[1;32m 1639\u001b[0m partial_embeddings \u001b[38;5;241m=\u001b[39m embedding\u001b[38;5;241m.\u001b[39membed_documents(texts[:\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m-> 1640\u001b[0m vector_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[43mpartial_embeddings\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m)\n\u001b[1;32m 1641\u001b[0m collection_name \u001b[38;5;241m=\u001b[39m collection_name \u001b[38;5;129;01mor\u001b[39;00m uuid\u001b[38;5;241m.\u001b[39muuid4()\u001b[38;5;241m.\u001b[39mhex\n\u001b[1;32m 1642\u001b[0m distance_func \u001b[38;5;241m=\u001b[39m distance_func\u001b[38;5;241m.\u001b[39mupper()\n",
841
+ "\u001b[0;31mIndexError\u001b[0m: list index out of range"
842
+ ]
843
+ }
844
+ ],
845
+ "source": [
846
+ "from sentence_transformers import SentenceTransformer\n",
847
+ "from langchain.vectorstores import Qdrant\n",
848
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
849
+ "\n",
850
+ "\n",
851
+ "# Load the SentenceTransformer model\n",
852
+ "#model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
853
+ "model_id = f\"{hf_username}/midterm-compare-arctic-embed-m-ft\" \n",
854
+ "embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
855
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m\"\n",
856
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id)\n",
857
+ "# model_id = \"Snowflake/snowflake-arctic-embed-m-v2.0\"\n",
858
+ "# embedding_model = HuggingFaceEmbeddings(model_name=model_id, model_kwargs={\"trust_remote_code\": True})\n",
859
+ "\n",
860
+ "\n",
861
+ "# Load documents into Qdrant\n",
862
+ "qdrant_vectorstore = Qdrant.from_documents(\n",
863
+ " documents_with_metadata,\n",
864
+ " embedding_model,\n",
865
+ " location=\":memory:\", # In-memory for testing\n",
866
+ " collection_name=\"document_comparison\",\n",
867
+ ")\n",
868
+ "\n",
869
+ "# Create a retriever\n",
870
+ "qdrant_retriever = qdrant_vectorstore.as_retriever()"
871
+ ]
872
+ },
873
+ {
874
+ "cell_type": "code",
875
+ "execution_count": 64,
876
+ "metadata": {},
877
+ "outputs": [],
878
+ "source": [
879
+ "from langchain_core.prompts import ChatPromptTemplate\n",
880
+ "RAG_PROMPT = \"\"\"\n",
881
+ "CONTEXT:\n",
882
+ "{context}\n",
883
+ "\n",
884
+ "QUERY:\n",
885
+ "{question}\n",
886
+ "\n",
887
+ "You are a helpful assistant. Use the available context to answer the question.\n",
888
+ "\n",
889
+ "Return the response in **valid JSON format** with the following structure:\n",
890
+ "\n",
891
+ "[\n",
892
+ " {{\n",
893
+ " \"Derived Description\": \"A short name for the matched concept\",\n",
894
+ " \"Protocol_1_Name\": \"Protocol 1 - Matching Element\",\n",
895
+ " \"Protocol_2_Name\": \"Protocol 2 - Matching Element\"\n",
896
+ " }},\n",
897
+ " ...\n",
898
+ "]\n",
899
+ "\n",
900
+ "### Rules:\n",
901
+ "1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.\n",
902
+ "2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.\n",
903
+ "3. If no matching element is found in a protocol, leave it empty (\"\").\n",
904
+ "4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.\n",
905
+ "5. It should include all the elements in the two protocols.\n",
906
+ "6. If it cannot match the element, create the row and include the protocol it did find and put \"could not match\" in the other protocol column.\n",
907
+ "\"\"\"\n",
908
+ "\n",
909
+ "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)\n",
910
+ "\n",
911
+ "from langchain_openai import ChatOpenAI\n",
912
+ "\n",
913
+ "#openai_chat_model = ChatOpenAI(model=\"gpt-4o\")\n",
914
+ "openai_chat_model = ChatOpenAI(model=\"gpt-4o-mini\")\n",
915
+ "\n",
916
+ "from operator import itemgetter\n",
917
+ "from langchain.schema.output_parser import StrOutputParser\n",
918
+ "\n",
919
+ "rag_chain = (\n",
920
+ " {\"context\": itemgetter(\"question\") | qdrant_retriever, \"question\": itemgetter(\"question\")}\n",
921
+ " | rag_prompt | openai_chat_model | StrOutputParser()\n",
922
+ ")"
923
+ ]
924
+ },
925
+ {
926
+ "cell_type": "code",
927
+ "execution_count": 65,
928
+ "metadata": {},
929
+ "outputs": [],
930
+ "source": [
931
+ "question_text = \"\"\"Between these two files containing protocols, can you find the data elements in each that most likely match the element in the other and output a CSV that lists three columns:\n",
932
+ "\n",
933
+ "The questions within elements will be similar between the two documents and can be used to match the elements.\n",
934
+ "\n",
935
+ "1. Derived description from the two documents describing the index/measure/scale.\n",
936
+ "2. A column for each standard.\n",
937
+ "3. In the column for each name/version, the data element used to capture that description.\n",
938
+ "\n",
939
+ "There should only be one row for each scale/index/etc.\n",
940
+ "The description should not be one of the questions but a name that best describes the similar data elements.\"\"\"\n",
941
+ "\n",
942
+ "response_text = rag_chain.invoke({\"question\": question_text})\n",
943
+ "# response = rag_chain.invoke({\"question\": question_text})"
944
+ ]
945
+ },
946
+ {
947
+ "cell_type": "code",
948
+ "execution_count": 66,
949
+ "metadata": {},
950
+ "outputs": [
951
+ {
952
+ "name": "stdout",
953
+ "output_type": "stream",
954
+ "text": [
955
+ "✅ CSV file saved: matching_data_elements.csv\n"
956
+ ]
957
+ }
958
+ ],
959
+ "source": [
960
+ "import json\n",
961
+ "import pandas as pd\n",
962
+ "\n",
963
+ "def parse_rag_output(response_text):\n",
964
+ " \"\"\"Extract structured JSON data from the RAG response.\"\"\"\n",
965
+ " try:\n",
966
+ " structured_data = json.loads(response_text)\n",
967
+ "\n",
968
+ " # Ensure similarity score is always included\n",
969
+ " for item in structured_data:\n",
970
+ " item.setdefault(\"Similarity Score\", \"N/A\") # Default if missing\n",
971
+ "\n",
972
+ " return structured_data\n",
973
+ " except json.JSONDecodeError:\n",
974
+ " print(\"Error: Response is not valid JSON.\")\n",
975
+ " return None\n",
976
+ "\n",
977
+ "def save_to_csv(data, directory=\"./output\", filename=\"matching_data_elements.csv\"):\n",
978
+ " \"\"\"Save structured data to CSV.\"\"\"\n",
979
+ " if not data:\n",
980
+ " print(\"No data to save.\")\n",
981
+ " return\n",
982
+ "\n",
983
+ " file_path = os.path.join(directory, filename)\n",
984
+ " df = pd.DataFrame(data, columns=[\"Derived Description\", \"Protocol_1_Name\", \"Protocol_2_Name\"]) # Ensure correct columns\n",
985
+ " df.to_csv(file_path, index=False)\n",
986
+ " print(f\"✅ CSV file saved: {filename}\")\n",
987
+ "\n",
988
+ "# Run the pipeline\n",
989
+ "structured_output = parse_rag_output(response_text)\n",
990
+ "save_to_csv(structured_output)\n"
991
+ ]
992
+ },
993
+ {
994
+ "cell_type": "code",
995
+ "execution_count": null,
996
+ "metadata": {},
997
+ "outputs": [],
998
+ "source": []
999
+ },
1000
+ {
1001
+ "cell_type": "code",
1002
+ "execution_count": 67,
1003
+ "metadata": {},
1004
+ "outputs": [],
1005
+ "source": [
1006
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2\"})"
1007
+ ]
1008
+ },
1009
+ {
1010
+ "cell_type": "code",
1011
+ "execution_count": 68,
1012
+ "metadata": {},
1013
+ "outputs": [],
1014
+ "source": [
1015
+ "# rag_chain.invoke({\"question\" : \"Based on the types of questions asked under each heading. can you identify the headings in one document that most closely match the second document. list them e.g paincoping/doc1 painstrategy/doc2. these are example headings not the ones in the actual documents. just list the matches not the rational. Can you list multiple matches?\"})"
1016
+ ]
1017
+ },
1018
+ {
1019
+ "cell_type": "code",
1020
+ "execution_count": null,
1021
+ "metadata": {},
1022
+ "outputs": [],
1023
+ "source": []
1024
+ },
1025
+ {
1026
+ "cell_type": "code",
1027
+ "execution_count": null,
1028
+ "metadata": {},
1029
+ "outputs": [],
1030
+ "source": []
1031
+ },
1032
+ {
1033
+ "cell_type": "code",
1034
+ "execution_count": 96,
1035
+ "metadata": {},
1036
+ "outputs": [],
1037
+ "source": [
1038
+ "### ragas testing below\n",
1039
+ "#docs = documents_with_metadata\n",
1040
+ "docs = text_loader.load()"
1041
+ ]
1042
+ },
1043
+ {
1044
+ "cell_type": "code",
1045
+ "execution_count": 91,
1046
+ "metadata": {},
1047
+ "outputs": [],
1048
+ "source": [
1049
+ "from langchain_core.prompts import ChatPromptTemplate\n",
1050
+ "\n",
1051
+ "RAG_PROMPT = \"\"\"\\\n",
1052
+ "Given a provided context and a question, you must answer the question. If you do not know the answer, you must state that you do not know.\n",
1053
+ "\n",
1054
+ "Context:\n",
1055
+ "{context}\n",
1056
+ "\n",
1057
+ "Question:\n",
1058
+ "{question}\n",
1059
+ "\n",
1060
+ "Answer:\n",
1061
+ "\"\"\"\n",
1062
+ "\n",
1063
+ "rag_prompt_template = ChatPromptTemplate.from_template(RAG_PROMPT)"
1064
+ ]
1065
+ },
1066
+ {
1067
+ "cell_type": "code",
1068
+ "execution_count": 92,
1069
+ "metadata": {},
1070
+ "outputs": [],
1071
+ "source": [
1072
+ "rag_llm = ChatOpenAI(\n",
1073
+ " model=\"gpt-4o-mini\",\n",
1074
+ " temperature=0\n",
1075
+ ")"
1076
+ ]
1077
+ },
1078
+ {
1079
+ "cell_type": "code",
1080
+ "execution_count": null,
1081
+ "metadata": {},
1082
+ "outputs": [],
1083
+ "source": []
1084
+ },
1085
+ {
1086
+ "cell_type": "code",
1087
+ "execution_count": 113,
1088
+ "metadata": {},
1089
+ "outputs": [
1090
+ {
1091
+ "name": "stderr",
1092
+ "output_type": "stream",
1093
+ "text": [
1094
+ "Some weights of BertModel were not initialized from the model checkpoint at drewgenai/midterm-compare-arctic-embed-m-ft and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']\n",
1095
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
1096
+ ]
1097
+ }
1098
+ ],
1099
+ "source": [
1100
+ "base_model_id = f\"Snowflake/snowflake-arctic-embed-m\" \n",
1101
+ "base_embedding_model = HuggingFaceEmbeddings(model_name=base_model_id)\n",
1102
+ "\n",
1103
+ "finetune_model_id = f\"{hf_username}/midterm-compare-arctic-embed-m-ft\" \n",
1104
+ "finetune_embedding_model = HuggingFaceEmbeddings(model_name=finetune_model_id)\n",
1105
+ "\n",
1106
+ "openai_model_id = \"text-embedding-3-small\"\n",
1107
+ "openai_embedding_model = OpenAIEmbeddings(model=openai_model_id)\n"
1108
+ ]
1109
+ },
1110
+ {
1111
+ "cell_type": "code",
1112
+ "execution_count": 114,
1113
+ "metadata": {},
1114
+ "outputs": [],
1115
+ "source": [
1116
+ "#from langchain_community.vectorstores import FAISS\n",
1117
+ "\n",
1118
+ "### try qdrant?\n",
1119
+ "\n",
1120
+ "qdrant_vectorstore_base = Qdrant.from_documents(\n",
1121
+ " docs,\n",
1122
+ " base_embedding_model,\n",
1123
+ " location=\":memory:\", # In-memory for testing\n",
1124
+ " collection_name=\"document_comparison\",\n",
1125
+ ")\n",
1126
+ "\n",
1127
+ "\n",
1128
+ "base_retriever = qdrant_vectorstore_base.as_retriever(search_kwargs={\"k\": 6})\n",
1129
+ "\n",
1130
+ "qdrant_vectorstore_finetune = Qdrant.from_documents(\n",
1131
+ " docs,\n",
1132
+ " finetune_embedding_model,\n",
1133
+ " location=\":memory:\", # In-memory for testing\n",
1134
+ " collection_name=\"document_comparison\",\n",
1135
+ ")\n",
1136
+ "\n",
1137
+ "\n",
1138
+ "finetune_retriever = qdrant_vectorstore_finetune.as_retriever(search_kwargs={\"k\": 6})\n",
1139
+ "\n",
1140
+ "\n",
1141
+ "\n",
1142
+ "qdrant_vectorstore_openai = Qdrant.from_documents(\n",
1143
+ " docs,\n",
1144
+ " openai_embedding_model,\n",
1145
+ " location=\":memory:\", # In-memory for testing\n",
1146
+ " collection_name=\"document_comparison\",\n",
1147
+ ")\n",
1148
+ "\n",
1149
+ "\n",
1150
+ "openai_retriever = qdrant_vectorstore_openai.as_retriever(search_kwargs={\"k\": 6})\n"
1151
+ ]
1152
+ },
1153
+ {
1154
+ "cell_type": "code",
1155
+ "execution_count": null,
1156
+ "metadata": {},
1157
+ "outputs": [],
1158
+ "source": [
1159
+ "\n",
1160
+ "# # Create a retriever\n",
1161
+ "# qdrant_retriever = qdrant_vectorstore.as_retriever()\n",
1162
+ "\n",
1163
+ "\n",
1164
+ "\n",
1165
+ "\n",
1166
+ "\n",
1167
+ "# ###\n",
1168
+ "\n",
1169
+ "# base_vectorstore = FAISS.from_documents(training_documents, base_embedding_model)\n",
1170
+ "# base_retriever = base_vectorstore.as_retriever(search_kwargs={\"k\": 6})"
1171
+ ]
1172
+ },
1173
+ {
1174
+ "cell_type": "code",
1175
+ "execution_count": 100,
1176
+ "metadata": {},
1177
+ "outputs": [],
1178
+ "source": [
1179
+ "from langchain.schema.runnable import RunnablePassthrough\n",
1180
+ "\n",
1181
+ "base_rag_chain = (\n",
1182
+ " {\"context\": itemgetter(\"question\") | base_retriever, \"question\": itemgetter(\"question\")}\n",
1183
+ " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
1184
+ " | {\"response\": rag_prompt_template | rag_llm | StrOutputParser(), \"context\": itemgetter(\"context\")}\n",
1185
+ ")"
1186
+ ]
1187
+ },
1188
+ {
1189
+ "cell_type": "code",
1190
+ "execution_count": 102,
1191
+ "metadata": {},
1192
+ "outputs": [],
1193
+ "source": [
1194
+ "finetune_rag_chain = (\n",
1195
+ " {\"context\": itemgetter(\"question\") | finetune_retriever, \"question\": itemgetter(\"question\")}\n",
1196
+ " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
1197
+ " | {\"response\": rag_prompt_template | rag_llm | StrOutputParser(), \"context\": itemgetter(\"context\")}\n",
1198
+ ")"
1199
+ ]
1200
+ },
1201
+ {
1202
+ "cell_type": "code",
1203
+ "execution_count": 115,
1204
+ "metadata": {},
1205
+ "outputs": [],
1206
+ "source": [
1207
+ "from langchain.schema.runnable import RunnablePassthrough\n",
1208
+ "\n",
1209
+ "openai_rag_chain = (\n",
1210
+ " {\"context\": itemgetter(\"question\") | openai_retriever, \"question\": itemgetter(\"question\")}\n",
1211
+ " | RunnablePassthrough.assign(context=itemgetter(\"context\"))\n",
1212
+ " | {\"response\": rag_prompt_template | rag_llm | StrOutputParser(), \"context\": itemgetter(\"context\")}\n",
1213
+ ")"
1214
+ ]
1215
+ },
1216
+ {
1217
+ "cell_type": "code",
1218
+ "execution_count": 87,
1219
+ "metadata": {},
1220
+ "outputs": [],
1221
+ "source": [
1222
+ "\n"
1223
+ ]
1224
+ },
1225
+ {
1226
+ "cell_type": "code",
1227
+ "execution_count": 103,
1228
+ "metadata": {},
1229
+ "outputs": [],
1230
+ "source": [
1231
+ "from ragas.llms import LangchainLLMWrapper\n",
1232
+ "from ragas.embeddings import LangchainEmbeddingsWrapper\n",
1233
+ "from langchain_openai import ChatOpenAI\n",
1234
+ "from langchain_openai import OpenAIEmbeddings\n",
1235
+ "generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))\n",
1236
+ "generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())"
1237
+ ]
1238
+ },
1239
+ {
1240
+ "cell_type": "code",
1241
+ "execution_count": 104,
1242
+ "metadata": {},
1243
+ "outputs": [
1244
+ {
1245
+ "data": {
1246
+ "application/vnd.jupyter.widget-view+json": {
1247
+ "model_id": "7c3166b3cd08451a9b2d35c0b73581af",
1248
+ "version_major": 2,
1249
+ "version_minor": 0
1250
+ },
1251
+ "text/plain": [
1252
+ "Applying SummaryExtractor: 0%| | 0/6 [00:00<?, ?it/s]"
1253
+ ]
1254
+ },
1255
+ "metadata": {},
1256
+ "output_type": "display_data"
1257
+ },
1258
+ {
1259
+ "data": {
1260
+ "application/vnd.jupyter.widget-view+json": {
1261
+ "model_id": "84fc7afd0ff04c0e8990cb88b9978867",
1262
+ "version_major": 2,
1263
+ "version_minor": 0
1264
+ },
1265
+ "text/plain": [
1266
+ "Applying CustomNodeFilter: 0%| | 0/7 [00:00<?, ?it/s]"
1267
+ ]
1268
+ },
1269
+ "metadata": {},
1270
+ "output_type": "display_data"
1271
+ },
1272
+ {
1273
+ "name": "stderr",
1274
+ "output_type": "stream",
1275
+ "text": [
1276
+ "Node 77fa3fd5-0ec7-4864-8a9f-fb6df33f64ec does not have a summary. Skipping filtering.\n"
1277
+ ]
1278
+ },
1279
+ {
1280
+ "data": {
1281
+ "application/vnd.jupyter.widget-view+json": {
1282
+ "model_id": "8e6bcaf303d641fa8c48f3dd8f077771",
1283
+ "version_major": 2,
1284
+ "version_minor": 0
1285
+ },
1286
+ "text/plain": [
1287
+ "Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]: 0%| | 0/20 [00:00<?, ?it/s]"
1288
+ ]
1289
+ },
1290
+ "metadata": {},
1291
+ "output_type": "display_data"
1292
+ },
1293
+ {
1294
+ "data": {
1295
+ "application/vnd.jupyter.widget-view+json": {
1296
+ "model_id": "4146d76a8f93496d909b6f56f2b99644",
1297
+ "version_major": 2,
1298
+ "version_minor": 0
1299
+ },
1300
+ "text/plain": [
1301
+ "Applying OverlapScoreBuilder: 0%| | 0/1 [00:00<?, ?it/s]"
1302
+ ]
1303
+ },
1304
+ "metadata": {},
1305
+ "output_type": "display_data"
1306
+ },
1307
+ {
1308
+ "data": {
1309
+ "application/vnd.jupyter.widget-view+json": {
1310
+ "model_id": "7bf10ce73bf04cdf9c8bb81d5134095f",
1311
+ "version_major": 2,
1312
+ "version_minor": 0
1313
+ },
1314
+ "text/plain": [
1315
+ "Generating personas: 0%| | 0/3 [00:00<?, ?it/s]"
1316
+ ]
1317
+ },
1318
+ "metadata": {},
1319
+ "output_type": "display_data"
1320
+ },
1321
+ {
1322
+ "data": {
1323
+ "application/vnd.jupyter.widget-view+json": {
1324
+ "model_id": "8c5a3b61bcb94ab19b0478a95b1b43ad",
1325
+ "version_major": 2,
1326
+ "version_minor": 0
1327
+ },
1328
+ "text/plain": [
1329
+ "Generating Scenarios: 0%| | 0/1 [00:00<?, ?it/s]"
1330
+ ]
1331
+ },
1332
+ "metadata": {},
1333
+ "output_type": "display_data"
1334
+ },
1335
+ {
1336
+ "data": {
1337
+ "application/vnd.jupyter.widget-view+json": {
1338
+ "model_id": "88fb910b941344ea9b2414c3010fad47",
1339
+ "version_major": 2,
1340
+ "version_minor": 0
1341
+ },
1342
+ "text/plain": [
1343
+ "Generating Samples: 0%| | 0/10 [00:00<?, ?it/s]"
1344
+ ]
1345
+ },
1346
+ "metadata": {},
1347
+ "output_type": "display_data"
1348
+ }
1349
+ ],
1350
+ "source": [
1351
+ "from ragas.testset import TestsetGenerator\n",
1352
+ "\n",
1353
+ "generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)\n",
1354
+ "dataset = generator.generate_with_langchain_docs(docs, testset_size=10)"
1355
+ ]
1356
+ },
1357
+ {
1358
+ "cell_type": "code",
1359
+ "execution_count": 105,
1360
+ "metadata": {},
1361
+ "outputs": [
1362
+ {
1363
+ "data": {
1364
+ "text/html": [
1365
+ "<div>\n",
1366
+ "<style scoped>\n",
1367
+ " .dataframe tbody tr th:only-of-type {\n",
1368
+ " vertical-align: middle;\n",
1369
+ " }\n",
1370
+ "\n",
1371
+ " .dataframe tbody tr th {\n",
1372
+ " vertical-align: top;\n",
1373
+ " }\n",
1374
+ "\n",
1375
+ " .dataframe thead th {\n",
1376
+ " text-align: right;\n",
1377
+ " }\n",
1378
+ "</style>\n",
1379
+ "<table border=\"1\" class=\"dataframe\">\n",
1380
+ " <thead>\n",
1381
+ " <tr style=\"text-align: right;\">\n",
1382
+ " <th></th>\n",
1383
+ " <th>user_input</th>\n",
1384
+ " <th>reference_contexts</th>\n",
1385
+ " <th>reference</th>\n",
1386
+ " <th>synthesizer_name</th>\n",
1387
+ " </tr>\n",
1388
+ " </thead>\n",
1389
+ " <tbody>\n",
1390
+ " <tr>\n",
1391
+ " <th>0</th>\n",
1392
+ " <td>How does the Pain Coping Strategy Scale (PCSS-...</td>\n",
1393
+ " <td>[Linked Psychological &amp; Physical Assessment\\nP...</td>\n",
1394
+ " <td>The Pain Coping Strategy Scale (PCSS-9) measur...</td>\n",
1395
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1396
+ " </tr>\n",
1397
+ " <tr>\n",
1398
+ " <th>1</th>\n",
1399
+ " <td>Cud yu pleese explane wut the Pain Coping Stra...</td>\n",
1400
+ " <td>[Linked Psychological &amp; Physical Assessment\\nP...</td>\n",
1401
+ " <td>The Pain Coping Strategy Scale (PCSS-9) measur...</td>\n",
1402
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1403
+ " </tr>\n",
1404
+ " <tr>\n",
1405
+ " <th>2</th>\n",
1406
+ " <td>Wht is the ERI-9 and how does it relate to emo...</td>\n",
1407
+ " <td>[Financial Stress Index (FSI-6)\\nThe FSI-6 eva...</td>\n",
1408
+ " <td>The Emotional Regulation Index (ERI-9) is ment...</td>\n",
1409
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1410
+ " </tr>\n",
1411
+ " <tr>\n",
1412
+ " <th>3</th>\n",
1413
+ " <td>what cognitive load management scale do</td>\n",
1414
+ " <td>[Financial Stress Index (FSI-6)\\nThe FSI-6 eva...</td>\n",
1415
+ " <td>The Cognitive Load Management Scale (CLMS-7) m...</td>\n",
1416
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1417
+ " </tr>\n",
1418
+ " <tr>\n",
1419
+ " <th>4</th>\n",
1420
+ " <td>What does the MRI-6 assessment evaluate?</td>\n",
1421
+ " <td>[The ERI-9 assesses an individual's ability to...</td>\n",
1422
+ " <td>The MRI-6 evaluates short-term and long-term m...</td>\n",
1423
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1424
+ " </tr>\n",
1425
+ " <tr>\n",
1426
+ " <th>5</th>\n",
1427
+ " <td>what scm-6 do for social confidence and public...</td>\n",
1428
+ " <td>[The ERI-9 assesses an individual's ability to...</td>\n",
1429
+ " <td>The SCM-6 evaluates levels of confidence in so...</td>\n",
1430
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1431
+ " </tr>\n",
1432
+ " <tr>\n",
1433
+ " <th>6</th>\n",
1434
+ " <td>What does the RDMT-6 assess in terms of cognit...</td>\n",
1435
+ " <td>[Linked Psychological &amp; Physical Assessment\\nC...</td>\n",
1436
+ " <td>The RDMT-6 evaluates logical reasoning and dec...</td>\n",
1437
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1438
+ " </tr>\n",
1439
+ " <tr>\n",
1440
+ " <th>7</th>\n",
1441
+ " <td>What does the CPAI-10 assess in individuals wi...</td>\n",
1442
+ " <td>[Linked Psychological &amp; Physical Assessment\\nC...</td>\n",
1443
+ " <td>The CPAI-10 evaluates the strategies people us...</td>\n",
1444
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1445
+ " </tr>\n",
1446
+ " <tr>\n",
1447
+ " <th>8</th>\n",
1448
+ " <td>What does the CWT-7 assessment measure in term...</td>\n",
1449
+ " <td>[I feel confident when making important decisi...</td>\n",
1450
+ " <td>The CWT-7 evaluates an individual's ability to...</td>\n",
1451
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1452
+ " </tr>\n",
1453
+ " <tr>\n",
1454
+ " <th>9</th>\n",
1455
+ " <td>What does the Stamina and Endurance Index (SEI...</td>\n",
1456
+ " <td>[I feel confident when making important decisi...</td>\n",
1457
+ " <td>The Stamina and Endurance Index (SEI-8) measur...</td>\n",
1458
+ " <td>single_hop_specifc_query_synthesizer</td>\n",
1459
+ " </tr>\n",
1460
+ " </tbody>\n",
1461
+ "</table>\n",
1462
+ "</div>"
1463
+ ],
1464
+ "text/plain": [
1465
+ " user_input \\\n",
1466
+ "0 How does the Pain Coping Strategy Scale (PCSS-... \n",
1467
+ "1 Cud yu pleese explane wut the Pain Coping Stra... \n",
1468
+ "2 Wht is the ERI-9 and how does it relate to emo... \n",
1469
+ "3 what cognitive load management scale do \n",
1470
+ "4 What does the MRI-6 assessment evaluate? \n",
1471
+ "5 what scm-6 do for social confidence and public... \n",
1472
+ "6 What does the RDMT-6 assess in terms of cognit... \n",
1473
+ "7 What does the CPAI-10 assess in individuals wi... \n",
1474
+ "8 What does the CWT-7 assessment measure in term... \n",
1475
+ "9 What does the Stamina and Endurance Index (SEI... \n",
1476
+ "\n",
1477
+ " reference_contexts \\\n",
1478
+ "0 [Linked Psychological & Physical Assessment\\nP... \n",
1479
+ "1 [Linked Psychological & Physical Assessment\\nP... \n",
1480
+ "2 [Financial Stress Index (FSI-6)\\nThe FSI-6 eva... \n",
1481
+ "3 [Financial Stress Index (FSI-6)\\nThe FSI-6 eva... \n",
1482
+ "4 [The ERI-9 assesses an individual's ability to... \n",
1483
+ "5 [The ERI-9 assesses an individual's ability to... \n",
1484
+ "6 [Linked Psychological & Physical Assessment\\nC... \n",
1485
+ "7 [Linked Psychological & Physical Assessment\\nC... \n",
1486
+ "8 [I feel confident when making important decisi... \n",
1487
+ "9 [I feel confident when making important decisi... \n",
1488
+ "\n",
1489
+ " reference \\\n",
1490
+ "0 The Pain Coping Strategy Scale (PCSS-9) measur... \n",
1491
+ "1 The Pain Coping Strategy Scale (PCSS-9) measur... \n",
1492
+ "2 The Emotional Regulation Index (ERI-9) is ment... \n",
1493
+ "3 The Cognitive Load Management Scale (CLMS-7) m... \n",
1494
+ "4 The MRI-6 evaluates short-term and long-term m... \n",
1495
+ "5 The SCM-6 evaluates levels of confidence in so... \n",
1496
+ "6 The RDMT-6 evaluates logical reasoning and dec... \n",
1497
+ "7 The CPAI-10 evaluates the strategies people us... \n",
1498
+ "8 The CWT-7 evaluates an individual's ability to... \n",
1499
+ "9 The Stamina and Endurance Index (SEI-8) measur... \n",
1500
+ "\n",
1501
+ " synthesizer_name \n",
1502
+ "0 single_hop_specifc_query_synthesizer \n",
1503
+ "1 single_hop_specifc_query_synthesizer \n",
1504
+ "2 single_hop_specifc_query_synthesizer \n",
1505
+ "3 single_hop_specifc_query_synthesizer \n",
1506
+ "4 single_hop_specifc_query_synthesizer \n",
1507
+ "5 single_hop_specifc_query_synthesizer \n",
1508
+ "6 single_hop_specifc_query_synthesizer \n",
1509
+ "7 single_hop_specifc_query_synthesizer \n",
1510
+ "8 single_hop_specifc_query_synthesizer \n",
1511
+ "9 single_hop_specifc_query_synthesizer "
1512
+ ]
1513
+ },
1514
+ "execution_count": 105,
1515
+ "metadata": {},
1516
+ "output_type": "execute_result"
1517
+ }
1518
+ ],
1519
+ "source": [
1520
+ "dataset.to_pandas()"
1521
+ ]
1522
+ },
1523
+ {
1524
+ "cell_type": "markdown",
1525
+ "metadata": {},
1526
+ "source": [
1527
+ "Eval with base model"
1528
+ ]
1529
+ },
1530
+ {
1531
+ "cell_type": "code",
1532
+ "execution_count": 106,
1533
+ "metadata": {},
1534
+ "outputs": [],
1535
+ "source": [
1536
+ "for test_row in dataset:\n",
1537
+ " response = base_rag_chain.invoke({\"question\" : test_row.eval_sample.user_input})\n",
1538
+ " test_row.eval_sample.response = response[\"response\"]\n",
1539
+ " test_row.eval_sample.retrieved_contexts = [context.page_content for context in response[\"context\"]]"
1540
+ ]
1541
+ },
1542
+ {
1543
+ "cell_type": "code",
1544
+ "execution_count": 107,
1545
+ "metadata": {},
1546
+ "outputs": [],
1547
+ "source": [
1548
+ "from ragas.llms import LangchainLLMWrapper\n",
1549
+ "\n",
1550
+ "evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o\"))"
1551
+ ]
1552
+ },
1553
+ {
1554
+ "cell_type": "code",
1555
+ "execution_count": 108,
1556
+ "metadata": {},
1557
+ "outputs": [],
1558
+ "source": [
1559
+ "from ragas import EvaluationDataset\n",
1560
+ "\n",
1561
+ "evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())"
1562
+ ]
1563
+ },
1564
+ {
1565
+ "cell_type": "code",
1566
+ "execution_count": 109,
1567
+ "metadata": {},
1568
+ "outputs": [
1569
+ {
1570
+ "data": {
1571
+ "application/vnd.jupyter.widget-view+json": {
1572
+ "model_id": "57340d6c46c347e19fecdc4490574a8b",
1573
+ "version_major": 2,
1574
+ "version_minor": 0
1575
+ },
1576
+ "text/plain": [
1577
+ "Evaluating: 0%| | 0/60 [00:00<?, ?it/s]"
1578
+ ]
1579
+ },
1580
+ "metadata": {},
1581
+ "output_type": "display_data"
1582
+ },
1583
+ {
1584
+ "name": "stderr",
1585
+ "output_type": "stream",
1586
+ "text": [
1587
+ "Exception raised in Job[13]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28698, Requested 2725. Please try again in 2.846s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1588
+ "Exception raised in Job[22]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29211, Requested 2254. Please try again in 2.93s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1589
+ "Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29563, Requested 2685. Please try again in 4.496s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1590
+ "Exception raised in Job[24]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29189, Requested 2555. Please try again in 3.488s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1591
+ "Exception raised in Job[28]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29993, Requested 2254. Please try again in 4.494s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1592
+ "Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29502, Requested 2743. Please try again in 4.49s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1593
+ "Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28840, Requested 2574. Please try again in 2.828s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1594
+ "Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29928, Requested 2511. Please try again in 4.878s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1595
+ "Exception raised in Job[7]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29823, Requested 2809. Please try again in 5.264s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1596
+ "Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29637, Requested 2665. Please try again in 4.604s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1597
+ "Exception raised in Job[36]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29185, Requested 2560. Please try again in 3.49s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1598
+ "Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29749, Requested 1558. Please try again in 2.614s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1599
+ "Exception raised in Job[5]: TimeoutError()\n",
1600
+ "Exception raised in Job[17]: TimeoutError()\n",
1601
+ "Exception raised in Job[43]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29678, Requested 2514. Please try again in 4.384s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1602
+ "Exception raised in Job[37]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28940, Requested 2499. Please try again in 2.878s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1603
+ "Exception raised in Job[40]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28657, Requested 2254. Please try again in 1.822s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n"
1604
+ ]
1605
+ },
1606
+ {
1607
+ "data": {
1608
+ "text/plain": [
1609
+ "{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.1973}"
1610
+ ]
1611
+ },
1612
+ "execution_count": 109,
1613
+ "metadata": {},
1614
+ "output_type": "execute_result"
1615
+ }
1616
+ ],
1617
+ "source": [
1618
+ "from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness, ResponseRelevancy, ContextEntityRecall, NoiseSensitivity\n",
1619
+ "from ragas import evaluate, RunConfig\n",
1620
+ "\n",
1621
+ "custom_run_config = RunConfig(timeout=360)\n",
1622
+ "\n",
1623
+ "result = evaluate(\n",
1624
+ " dataset=evaluation_dataset,\n",
1625
+ " metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],\n",
1626
+ " llm=evaluator_llm,\n",
1627
+ " run_config=custom_run_config\n",
1628
+ ")\n",
1629
+ "result"
1630
+ ]
1631
+ },
1632
+ {
1633
+ "cell_type": "markdown",
1634
+ "metadata": {},
1635
+ "source": [
1636
+ "Evaluate the Fine tuned.\n"
1637
+ ]
1638
+ },
1639
+ {
1640
+ "cell_type": "code",
1641
+ "execution_count": 110,
1642
+ "metadata": {},
1643
+ "outputs": [],
1644
+ "source": [
1645
+ "for test_row in dataset:\n",
1646
+ " response = finetune_rag_chain.invoke({\"question\" : test_row.eval_sample.user_input})\n",
1647
+ " test_row.eval_sample.response = response[\"response\"]\n",
1648
+ " test_row.eval_sample.retrieved_contexts = [context.page_content for context in response[\"context\"]]"
1649
+ ]
1650
+ },
1651
+ {
1652
+ "cell_type": "code",
1653
+ "execution_count": 111,
1654
+ "metadata": {},
1655
+ "outputs": [],
1656
+ "source": [
1657
+ "evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())"
1658
+ ]
1659
+ },
1660
+ {
1661
+ "cell_type": "code",
1662
+ "execution_count": 112,
1663
+ "metadata": {},
1664
+ "outputs": [
1665
+ {
1666
+ "data": {
1667
+ "application/vnd.jupyter.widget-view+json": {
1668
+ "model_id": "758cb2b2b6df49e88c88b1fca6c09f3c",
1669
+ "version_major": 2,
1670
+ "version_minor": 0
1671
+ },
1672
+ "text/plain": [
1673
+ "Evaluating: 0%| | 0/60 [00:00<?, ?it/s]"
1674
+ ]
1675
+ },
1676
+ "metadata": {},
1677
+ "output_type": "display_data"
1678
+ },
1679
+ {
1680
+ "name": "stderr",
1681
+ "output_type": "stream",
1682
+ "text": [
1683
+ "Exception raised in Job[22]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28950, Requested 2254. Please try again in 2.408s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1684
+ "Exception raised in Job[16]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28949, Requested 2254. Please try again in 2.406s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1685
+ "Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28567, Requested 2751. Please try again in 2.636s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1686
+ "Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28831, Requested 2511. Please try again in 2.684s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1687
+ "Exception raised in Job[28]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29242, Requested 2254. Please try again in 2.992s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1688
+ "Exception raised in Job[24]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29683, Requested 2555. Please try again in 4.476s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1689
+ "Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29672, Requested 1515. Please try again in 2.374s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1690
+ "Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29901, Requested 2743. Please try again in 5.288s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1691
+ "Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29651, Requested 2574. Please try again in 4.45s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1692
+ "Exception raised in Job[7]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29659, Requested 2771. Please try again in 4.86s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1693
+ "Exception raised in Job[34]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28744, Requested 2265. Please try again in 2.018s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1694
+ "Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29754, Requested 2665. Please try again in 4.838s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1695
+ "Exception raised in Job[5]: TimeoutError()\n",
1696
+ "Exception raised in Job[36]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29775, Requested 2560. Please try again in 4.67s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1697
+ "Exception raised in Job[17]: TimeoutError()\n",
1698
+ "Exception raised in Job[23]: TimeoutError()\n",
1699
+ "Exception raised in Job[40]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28967, Requested 2254. Please try again in 2.442s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1700
+ "Exception raised in Job[46]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28976, Requested 2250. Please try again in 2.452s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1701
+ "Exception raised in Job[37]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28735, Requested 2499. Please try again in 2.468s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n"
1702
+ ]
1703
+ },
1704
+ {
1705
+ "data": {
1706
+ "text/plain": [
1707
+ "{'context_recall': 1.0000, 'faithfulness': 0.8500, 'factual_correctness': 0.7220, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.7917, 'noise_sensitivity_relevant': 0.1111}"
1708
+ ]
1709
+ },
1710
+ "execution_count": 112,
1711
+ "metadata": {},
1712
+ "output_type": "execute_result"
1713
+ }
1714
+ ],
1715
+ "source": [
1716
+ "result = evaluate(\n",
1717
+ " dataset=evaluation_dataset,\n",
1718
+ " metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],\n",
1719
+ " llm=evaluator_llm,\n",
1720
+ " run_config=custom_run_config\n",
1721
+ ")\n",
1722
+ "result"
1723
+ ]
1724
+ },
1725
+ {
1726
+ "cell_type": "markdown",
1727
+ "metadata": {},
1728
+ "source": [
1729
+ "Evaluate the openai model"
1730
+ ]
1731
+ },
1732
+ {
1733
+ "cell_type": "code",
1734
+ "execution_count": 116,
1735
+ "metadata": {},
1736
+ "outputs": [],
1737
+ "source": [
1738
+ "for test_row in dataset:\n",
1739
+ " response = openai_rag_chain.invoke({\"question\" : test_row.eval_sample.user_input})\n",
1740
+ " test_row.eval_sample.response = response[\"response\"]\n",
1741
+ " test_row.eval_sample.retrieved_contexts = [context.page_content for context in response[\"context\"]]"
1742
+ ]
1743
+ },
1744
+ {
1745
+ "cell_type": "code",
1746
+ "execution_count": 117,
1747
+ "metadata": {},
1748
+ "outputs": [],
1749
+ "source": [
1750
+ "evaluation_dataset = EvaluationDataset.from_pandas(dataset.to_pandas())"
1751
+ ]
1752
+ },
1753
+ {
1754
+ "cell_type": "code",
1755
+ "execution_count": 118,
1756
+ "metadata": {},
1757
+ "outputs": [
1758
+ {
1759
+ "data": {
1760
+ "application/vnd.jupyter.widget-view+json": {
1761
+ "model_id": "a3f59e7e78294492a701763a859d6239",
1762
+ "version_major": 2,
1763
+ "version_minor": 0
1764
+ },
1765
+ "text/plain": [
1766
+ "Evaluating: 0%| | 0/60 [00:00<?, ?it/s]"
1767
+ ]
1768
+ },
1769
+ "metadata": {},
1770
+ "output_type": "display_data"
1771
+ },
1772
+ {
1773
+ "name": "stderr",
1774
+ "output_type": "stream",
1775
+ "text": [
1776
+ "Exception raised in Job[30]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28587, Requested 2574. Please try again in 2.322s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1777
+ "Exception raised in Job[25]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29460, Requested 2782. Please try again in 4.484s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1778
+ "Exception raised in Job[1]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29365, Requested 2991. Please try again in 4.712s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1779
+ "Exception raised in Job[24]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29067, Requested 2826. Please try again in 3.786s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1780
+ "Exception raised in Job[13]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28945, Requested 2968. Please try again in 3.826s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1781
+ "Exception raised in Job[22]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29841, Requested 2525. Please try again in 4.732s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1782
+ "Exception raised in Job[19]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29512, Requested 2895. Please try again in 4.814s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1783
+ "Exception raised in Job[11]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29581, Requested 1650. Please try again in 2.462s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1784
+ "Exception raised in Job[7]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29318, Requested 3175. Please try again in 4.986s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1785
+ "Exception raised in Job[28]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 28799, Requested 2525. Please try again in 2.648s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1786
+ "Exception raised in Job[5]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29787, Requested 1465. Please try again in 2.504s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1787
+ "Exception raised in Job[34]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29638, Requested 2265. Please try again in 3.805s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1788
+ "Exception raised in Job[31]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29242, Requested 2736. Please try again in 3.956s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n",
1789
+ "Exception raised in Job[35]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-TU5fm55zJrncrgPcg3lg23B6 on tokens per min (TPM): Limit 30000, Used 29647, Requested 1516. Please try again in 2.326s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})\n"
1790
+ ]
1791
+ },
1792
+ {
1793
+ "data": {
1794
+ "text/plain": [
1795
+ "{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9463, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.3095}"
1796
+ ]
1797
+ },
1798
+ "execution_count": 118,
1799
+ "metadata": {},
1800
+ "output_type": "execute_result"
1801
+ }
1802
+ ],
1803
+ "source": [
1804
+ "result = evaluate(\n",
1805
+ " dataset=evaluation_dataset,\n",
1806
+ " metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness(), ResponseRelevancy(), ContextEntityRecall(), NoiseSensitivity()],\n",
1807
+ " llm=evaluator_llm,\n",
1808
+ " run_config=custom_run_config\n",
1809
+ ")\n",
1810
+ "result"
1811
+ ]
1812
+ },
1813
+ {
1814
+ "cell_type": "markdown",
1815
+ "metadata": {},
1816
+ "source": []
1817
+ },
1818
+ {
1819
+ "cell_type": "markdown",
1820
+ "metadata": {},
1821
+ "source": [
1822
+ "\n",
1823
+ "Base model evaluation\n",
1824
+ "{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.1973}\n",
1825
+ "\n",
1826
+ "Finetuned model\n",
1827
+ "{'context_recall': 1.0000, 'faithfulness': 0.8500, 'factual_correctness': 0.7220, 'answer_relevancy': 0.9481, 'context_entity_recall': 0.7917, 'noise_sensitivity_relevant': 0.1111}\n",
1828
+ "\n",
1829
+ "\n",
1830
+ "Openai model\n",
1831
+ "{'context_recall': 1.0000, 'faithfulness': 1.0000, 'factual_correctness': 0.7540, 'answer_relevancy': 0.9463, 'context_entity_recall': 0.8095, 'noise_sensitivity_relevant': 0.3095}\n",
1832
+ "\n",
1833
+ "\n",
1834
+ "\n",
1835
+ "Base snowflake model and OpenAI are very similar with the openai model performing slightly better for noise sensitivity.\n",
1836
+ "The finetuned snowflak model perform does not perform better in most case though it reduces noise sensitivity."
1837
+ ]
1838
+ }
1839
+ ],
1840
+ "metadata": {
1841
+ "kernelspec": {
1842
+ "display_name": ".venv",
1843
+ "language": "python",
1844
+ "name": "python3"
1845
+ },
1846
+ "language_info": {
1847
+ "codemirror_mode": {
1848
+ "name": "ipython",
1849
+ "version": 3
1850
+ },
1851
+ "file_extension": ".py",
1852
+ "mimetype": "text/x-python",
1853
+ "name": "python",
1854
+ "nbconvert_exporter": "python",
1855
+ "pygments_lexer": "ipython3",
1856
+ "version": "3.13.1"
1857
+ }
1858
+ },
1859
+ "nbformat": 4,
1860
+ "nbformat_minor": 2
1861
+ }
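The three result dictionaries reported at the end of the notebook are easier to compare side by side than as separate outputs. A minimal sketch, with the metric values copied verbatim from the notebook outputs above, that tabulates them with pandas:

```python
import pandas as pd

# Metric values copied from the three evaluation outputs above.
results = {
    "base_snowflake": {
        "context_recall": 1.0000, "faithfulness": 1.0000, "factual_correctness": 0.7540,
        "answer_relevancy": 0.9481, "context_entity_recall": 0.8095, "noise_sensitivity_relevant": 0.1973,
    },
    "finetuned_snowflake": {
        "context_recall": 1.0000, "faithfulness": 0.8500, "factual_correctness": 0.7220,
        "answer_relevancy": 0.9481, "context_entity_recall": 0.7917, "noise_sensitivity_relevant": 0.1111,
    },
    "openai": {
        "context_recall": 1.0000, "faithfulness": 1.0000, "factual_correctness": 0.7540,
        "answer_relevancy": 0.9463, "context_entity_recall": 0.8095, "noise_sensitivity_relevant": 0.3095,
    },
}

# One row per embedding model, one column per RAGAS metric.
comparison = pd.DataFrame(results).T
print(comparison.round(4))
```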
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Get a distribution that has uv already installed
2
+ FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim
3
+
4
+ # Add user - this is the user that will run the app
5
+ # If you do not set user, the app will run as root (undesirable)
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+
9
+ # Set the home directory and path
10
+ ENV HOME=/home/user \
11
+ PATH=/home/user/.local/bin:$PATH
12
+
13
+ ENV UVICORN_WS_PROTOCOL=websockets
14
+
15
+
16
+ # Set the working directory
17
+ WORKDIR $HOME/app
18
+
19
+ # Copy the app to the container
20
+ COPY --chown=user . $HOME/app
21
+
22
+ # Install the dependencies
23
+ # RUN uv sync --frozen
24
+ RUN uv sync
25
+
26
+ # Expose the port
27
+ EXPOSE 7860
28
+
29
+ # Run the app
30
+ CMD ["uv", "run", "chainlit", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: midterm_poc
3
+ emoji: 🌖
4
+ colorFrom: gray
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ short_description: midterm POC
9
+ license: apache-2.0
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import json
4
+ import pandas as pd
5
+ import chainlit as cl
6
+ from dotenv import load_dotenv
7
+ from langchain_core.documents import Document
8
+ from langchain_community.document_loaders import PyMuPDFLoader
9
+ from langchain_experimental.text_splitter import SemanticChunker
10
+ from langchain_community.vectorstores import Qdrant
11
+ from langchain_community.embeddings import HuggingFaceEmbeddings
12
+ from langchain_core.output_parsers import StrOutputParser
13
+ from langchain_openai import ChatOpenAI
14
+ from langchain_core.prompts import ChatPromptTemplate
15
+ from langgraph.graph import START, StateGraph
16
+ from langchain.tools import tool
17
+ from langchain.schema import HumanMessage
18
+ from typing_extensions import List, TypedDict
19
+ from operator import itemgetter
20
+
21
+ # Load environment variables
22
+ load_dotenv()
23
+
24
+ # Define paths
25
+ UPLOAD_PATH = "upload/"
26
+ OUTPUT_PATH = "output/"
27
+ os.makedirs(UPLOAD_PATH, exist_ok=True)
28
+ os.makedirs(OUTPUT_PATH, exist_ok=True)
29
+
30
+ # Initialize embeddings model
31
+ model_id = "Snowflake/snowflake-arctic-embed-m"
32
+ embedding_model = HuggingFaceEmbeddings(model_name=model_id)
33
+
34
+ # Define semantic chunker
35
+ semantic_splitter = SemanticChunker(embedding_model)
36
+
37
+ # Initialize LLM
38
+ llm = ChatOpenAI(model="gpt-4o-mini")
39
+
40
+ # Define RAG prompt
41
+ export_prompt = """
42
+ CONTEXT:
43
+ {context}
44
+
45
+ QUERY:
46
+ {question}
47
+
48
+ You are a helpful assistant. Use the available context to answer the question.
49
+
50
+ Between these two files containing protocols, identify and match **entire assessment sections** based on conceptual similarity. Do NOT match individual questions.
51
+
52
+ ### **Output Format:**
53
+ Return the response in **valid JSON format** structured as a list of dictionaries, where each dictionary contains:
54
+ [
55
+ {{
56
+ "Derived Description": "A short name for the matched concept",
57
+ "Protocol_1": "Protocol 1 - Matching Element",
58
+ "Protocol_2": "Protocol 2 - Matching Element"
59
+ }},
60
+ ...
61
+ ]
62
+ ### **Example Output:**
63
+ [
64
+ {{
65
+ "Derived Description": "Pain Coping Strategies",
66
+ "Protocol_1": "Pain Coping Strategy Scale (PCSS-9)",
67
+ "Protocol_2": "Chronic Pain Adjustment Index (CPAI-10)"
68
+ }},
69
+ {{
70
+ "Derived Description": "Work Stress and Fatigue",
71
+ "Protocol_1": "Work-Related Stress Scale (WRSS-8)",
72
+ "Protocol_2": "Occupational Fatigue Index (OFI-7)"
73
+ }},
74
+ ...
75
+ ]
76
+
77
+ ### Rules:
78
+ 1. Only output **valid JSON** with no explanations, summaries, or markdown formatting.
79
+ 2. Ensure each entry in the JSON list represents a single matched data element from the two protocols.
80
+ 3. If no matching element is found in a protocol, leave it empty ("").
81
+ 4. **Do NOT include headers, explanations, or additional formatting**—only return the raw JSON list.
82
+ 5. It should include all the elements in the two protocols.
83
+ 6. If it cannot match the element, create the row and include the protocol it did find and put "could not match" in the other protocol column.
84
+ 7. Protocol_1 and Protocol_2 should each be taken from a different one of the two uploaded protocol files.
85
+ """
86
+
87
+ compare_export_prompt = ChatPromptTemplate.from_template(export_prompt)
88
+
89
+ QUERY_PROMPT = """
90
+ You are a helpful assistant. Use the available context to answer the question concisely and informatively.
91
+
92
+ CONTEXT:
93
+ {context}
94
+
95
+ QUERY:
96
+ {question}
97
+
98
+ Provide a natural-language response using the given information. If you do not know the answer, say so.
99
+ """
100
+
101
+ query_prompt = ChatPromptTemplate.from_template(QUERY_PROMPT)
102
+
103
+
104
+ @tool
105
+ def document_query_tool(question: str) -> str:
106
+ """Retrieves relevant document sections and answers questions based on the uploaded documents."""
107
+
108
+ retriever = cl.user_session.get("qdrant_retriever")
109
+ if not retriever:
110
+ return "Error: No documents available for retrieval. Please upload documents first."
111
+
112
+ # Retrieve context from the vector database
113
+ retrieved_docs = retriever.invoke(question)
114
+ docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
115
+
116
+ # Generate response using the natural query prompt
117
+ messages = query_prompt.format_messages(question=question, context=docs_content)
118
+ response = llm.invoke(messages)
119
+
120
+ return {
121
+ "messages": [HumanMessage(content=response.content)],
122
+ "context": retrieved_docs
123
+ }
124
+
125
+
126
+
127
+ @tool
128
+ def document_comparison_tool(question: str) -> str:
129
+ """Compares the two uploaded documents, identifies matched elements, exports them as JSON, formats into CSV, and provides a download link."""
130
+
131
+ # Retrieve the vector database retriever
132
+ retriever = cl.user_session.get("qdrant_retriever")
133
+ if not retriever:
134
+ return "Error: No documents available for retrieval. Please upload two PDF files first."
135
+
136
+ # Process query using RAG
137
+ rag_chain = (
138
+ {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
139
+ | compare_export_prompt | llm | StrOutputParser()
140
+ )
141
+ response_text = rag_chain.invoke({"question": question})
142
+
143
+ # Parse response and save as CSV
144
+ try:
145
+ structured_data = json.loads(response_text)
146
+ if not structured_data:
147
+ return "Error: No matched elements found."
148
+
149
+ # Define output file path
150
+ file_path = os.path.join(OUTPUT_PATH, "comparison_results.csv")
151
+
152
+ # Save to CSV
153
+ df = pd.DataFrame(structured_data, columns=["Derived Description", "Protocol_1", "Protocol_2"])
154
+ df.to_csv(file_path, index=False)
155
+
156
+ return file_path # Return path to the CSV file
157
+
158
+ except json.JSONDecodeError:
159
+ return "Error: Response is not valid JSON."
160
+
161
+
162
+
163
+ tool_belt = [document_query_tool, document_comparison_tool]
164
+ model = ChatOpenAI(model="gpt-4o", temperature=0)
165
+ model = model.bind_tools(tool_belt)
166
+
167
+ async def process_files(files: list[cl.File]):
168
+ documents_with_metadata = []
169
+ for file in files:
170
+ file_path = os.path.join(UPLOAD_PATH, file.name)
171
+ shutil.copyfile(file.path, file_path)
172
+
173
+ loader = PyMuPDFLoader(file_path)
174
+ documents = loader.load()
175
+
176
+ for doc in documents:
177
+ source_name = file.name
178
+ chunks = semantic_splitter.split_text(doc.page_content)
179
+ for chunk in chunks:
180
+ doc_chunk = Document(page_content=chunk, metadata={"source": source_name})
181
+ documents_with_metadata.append(doc_chunk)
182
+
183
+ if documents_with_metadata:
184
+ qdrant_vectorstore = Qdrant.from_documents(
185
+ documents_with_metadata,
186
+ embedding_model,
187
+ location=":memory:",
188
+ collection_name="document_comparison",
189
+ )
190
+ return qdrant_vectorstore.as_retriever()
191
+ return None
192
+
193
+ @cl.on_chat_start
194
+ async def start():
195
+ cl.user_session.set("qdrant_retriever", None)
196
+ files = await cl.AskFileMessage(
197
+ content="Please upload **two PDF files** for comparison:",
198
+ accept=["application/pdf"],
199
+ max_files=2
200
+ ).send()
201
+
202
+ if len(files) != 2:
203
+ await cl.Message("Error: You must upload exactly two PDF files.").send()
204
+ return
205
+
206
+ retriever = await process_files(files)
207
+ if retriever:
208
+ cl.user_session.set("qdrant_retriever", retriever)
209
+ await cl.Message("Files uploaded and processed successfully! You can now enter your query.").send()
210
+ else:
211
+ await cl.Message("Error: Unable to process files. Please try again.").send()
212
+
213
+ @cl.on_message
214
+ async def handle_message(message: cl.Message):
215
+ user_input = message.content.lower()
216
+
217
+ # If the user asks for a comparison, run the document_comparison_tool
218
+ if "compare" in user_input or "export" in user_input:
219
+ file_path = document_comparison_tool.invoke(user_input)
220
+ if file_path and file_path.endswith(".csv"):
221
+ await cl.Message(
222
+ content="Comparison complete! Download the CSV below:",
223
+ elements=[cl.File(name="comparison_results.csv", path=file_path, display="inline")],
224
+ ).send()
225
+ else:
226
+ await cl.Message(file_path).send()
227
+ else:
228
+ response_text = document_query_tool.invoke(user_input)
229
+ await cl.Message(response_text["messages"][0].content).send()
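The comparison tool in app.py relies on the LLM returning raw JSON that can be loaded and written straight to CSV. Below is a minimal sketch of that parsing step in isolation; the sample response string is hypothetical (made up for illustration, not real model output), but it follows the format requested by export_prompt, so it can be used to sanity-check the column handling without running the full Chainlit app:

```python
import json
import pandas as pd

# Hypothetical model output in the format requested by export_prompt.
sample_response = """[
  {"Derived Description": "Pain Coping Strategies",
   "Protocol_1": "Pain Coping Strategy Scale (PCSS-9)",
   "Protocol_2": "Chronic Pain Adjustment Index (CPAI-10)"},
  {"Derived Description": "Work Stress and Fatigue",
   "Protocol_1": "Work-Related Stress Scale (WRSS-8)",
   "Protocol_2": "could not match"}
]"""

try:
    structured_data = json.loads(sample_response)
    # Same column order the app writes to comparison_results.csv.
    df = pd.DataFrame(structured_data, columns=["Derived Description", "Protocol_1", "Protocol_2"])
    df.to_csv("comparison_results_sample.csv", index=False)
    print(df)
except json.JSONDecodeError:
    print("Error: Response is not valid JSON.")
```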
chainlit.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Welcome to Chat with Your Text File
2
+ With this application, you can compare two uploaded PDF documents.
example_files/florida_protocol.pdf ADDED
Binary file (3.97 kB). View file
 
example_files/matching_data_elements.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Derived Description,Protocol_1,Protocol_2
2
+ Pain Coping Strategies,Pain Coping Strategy Scale (PCSS-9),Pain Management Techniques
3
+ Work Stress Assessment,Work-Related Stress Scale (WRSS-8),Occupational Fatigue Index (OFI-7)
4
+ Decision-Making Confidence,Decision-Making Confidence Scale (DMCS-6),Rational Decision-Making Test (RDMT-6)
5
+ Cognitive Task Management,Cognitive Load and Task Management,Cognitive and Emotional Resilience
6
+ Emotional Resilience and Regulation,Emotional Resilience Score (ERS-9),Emotional Regulation Index (ERI-9)
7
+ Social Engagement and Communication,Public Speaking and Social Engagement (PSSE-6),could not match
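The CSV above is the reference output the comparison tool is expected to produce for the bundled example protocols. A quick sketch, assuming it is read from the repository root, to confirm the expected column layout:

```python
import pandas as pd

# Path relative to the repository root.
example = pd.read_csv("example_files/matching_data_elements.csv")
print(example.columns.tolist())  # ['Derived Description', 'Protocol_1', 'Protocol_2']
print(example.to_string(index=False))
```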
example_files/wyoming_protocol.pdf ADDED
Binary file (4.36 kB). View file
 
pyproject.toml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "midterm_poc"
3
+ version = "0.1.0"
4
+ description = "midterm POC huggingface project"
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ dependencies = [
8
+ "chainlit",
9
+ "langchain",
10
+ "langchain_community",
11
+ "tqdm",
12
+ "PyMuPDF",
13
+ "openai>=1.59.9",
14
+ "pypdf2>=3.0.1",
15
+ "websockets",
16
+ "qdrant-client",
17
+ "langchain",
18
+ "langchain-community",
19
+ "langchain-openai",
20
+ "unstructured",
21
+ "pymupdf",
22
+ "qdrant-client",
23
+ "langgraph",
24
+ "langchain-core",
25
+ "langchain-openai",
26
+ "langchain-community",
27
+ "ragas",
28
+ "langchain_experimental",
29
+ ###review
30
+ ### cleanup
31
+ "langchain-core==0.3.31",
32
+ "langchain==0.3.15",
33
+ "langchain-community==0.3.15",
34
+ "langchain-openai==0.3.1",
35
+ "langchain-qdrant==0.2.0",
36
+ "langchain-text-splitters>=0.3.5",
37
+ "langchain-huggingface==0.1.2",
38
+ #"langgraph>=0.2.67",
39
+ "langsmith>=0.3.1",
40
+ "lxml>=5.3.0",
41
+ ###notebook
42
+ "ipykernel",
43
+ "ipywidgets",
44
+ "IProgress",
45
+ "huggingface_hub",
46
+ "wandb",
47
+ "transformers",
48
+ "accelerate",
49
+ "torch",
50
+ #### ragas
51
+ #"ragas==0.2.10"
52
+ #"FAISS"
53
+ #remove only used for testing
54
+ "cohere",
55
+ "langchain_cohere",
56
+ "arxiv"
57
+ ]
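Because the dependency list mixes unpinned names with exact pins, it can be useful to confirm which versions actually resolved after `uv sync`. A minimal sketch using only the standard library; the pins below are copied from the pyproject.toml above:

```python
from importlib.metadata import version, PackageNotFoundError

# Exact pins taken from pyproject.toml.
pinned = {
    "langchain-core": "0.3.31",
    "langchain": "0.3.15",
    "langchain-community": "0.3.15",
    "langchain-openai": "0.3.1",
    "langchain-qdrant": "0.2.0",
    "langchain-huggingface": "0.1.2",
}

for package, expected in pinned.items():
    try:
        installed = version(package)
        status = "OK" if installed == expected else f"MISMATCH (installed {installed})"
    except PackageNotFoundError:
        status = "NOT INSTALLED"
    print(f"{package}=={expected}: {status}")
```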
uv.lock ADDED
The diff for this file is too large to render. See raw diff