Spaces:
Runtime error
Runtime error
Commit
·
70e2d85
1
Parent(s):
5ea4259
resolved pdf reader issue
Browse files- config/model_config_advanced.yml +1 -1
- database/mock_qna.sqlite +1 -1
- models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/data_level0.bin +3 -0
- models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/header.bin +3 -0
- models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/index_metadata.pickle +3 -0
- models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/length.bin +3 -0
- models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/link_lists.bin +3 -0
- models/chroma_db_advanced_corrected/chroma.sqlite3 +3 -0
- notebooks/002_persisted-embedding-model-advanced.ipynb +118 -148
- notebooks/007_test_hi_content_engine.ipynb +41 -45
- requirements.txt +1 -0
config/model_config_advanced.yml
CHANGED
|
@@ -11,7 +11,7 @@ embeddings:
|
|
| 11 |
fine_tuned_embedding_model: 'local:models/fine-tuned-embeddings-advanced'
|
| 12 |
|
| 13 |
vector_store:
|
| 14 |
-
persisted_path: './models/
|
| 15 |
|
| 16 |
questionaire_data:
|
| 17 |
db_path: './database/mock_qna.sqlite'
|
|
|
|
| 11 |
fine_tuned_embedding_model: 'local:models/fine-tuned-embeddings-advanced'
|
| 12 |
|
| 13 |
vector_store:
|
| 14 |
+
persisted_path: './models/chroma_db_advanced_corrected'
|
| 15 |
|
| 16 |
questionaire_data:
|
| 17 |
db_path: './database/mock_qna.sqlite'
|
database/mock_qna.sqlite
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 40960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d7f3f8c146d46df19f3dd8a4846ccbf63f88e6dd914b67a0c5c689eba21a558d
|
| 3 |
size 40960
|
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c7deba9397301d1ea3f7f5edcb06162bc4797984100c456b128303c58b95c79
|
| 3 |
+
size 31844000
|
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fdac383bac7a3236c814029cee7525e8018d396f9cd0d15b97a22a3af9090d8
|
| 3 |
+
size 100
|
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/index_metadata.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:175eec7edc141af44f996bba0295ddb71d3fd54c39b1352539ede6753f00e834
|
| 3 |
+
size 1100226
|
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68f24a06f63a85c1b082283b0be703475bf9023e9f4f7e8b3bd4bca276af3b8f
|
| 3 |
+
size 76000
|
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/link_lists.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:224e61d8e99cd63b57c6e9204f3136aba4a43f9bb15d7dcd5eb181c9378829f8
|
| 3 |
+
size 167188
|
models/chroma_db_advanced_corrected/chroma.sqlite3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf347334a74a91a9d24d298ad1b12fa043579fe0b915cf7bc558f3d2acafca5c
|
| 3 |
+
size 299061248
|
notebooks/002_persisted-embedding-model-advanced.ipynb
CHANGED
|
@@ -10,7 +10,7 @@
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
-
"execution_count":
|
| 14 |
"id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97",
|
| 15 |
"metadata": {},
|
| 16 |
"outputs": [],
|
|
@@ -33,24 +33,16 @@
|
|
| 33 |
"import nest_asyncio\n",
|
| 34 |
"nest_asyncio.apply()\n",
|
| 35 |
"\n",
|
| 36 |
-
"import time"
|
|
|
|
| 37 |
]
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"cell_type": "code",
|
| 41 |
-
"execution_count":
|
| 42 |
"id": "978152ce-4d87-44b5-b521-dbaff60b32b0",
|
| 43 |
"metadata": {},
|
| 44 |
-
"outputs": [
|
| 45 |
-
{
|
| 46 |
-
"name": "stderr",
|
| 47 |
-
"output_type": "stream",
|
| 48 |
-
"text": [
|
| 49 |
-
"199it [00:00, 8821.71it/s]\n",
|
| 50 |
-
"200it [00:00, 12584.17it/s]\n"
|
| 51 |
-
]
|
| 52 |
-
}
|
| 53 |
-
],
|
| 54 |
"source": [
|
| 55 |
"split_content(filepath=\"../raw_documents/answers.txt\", \n",
|
| 56 |
" separator=\"\\n\\n\", \n",
|
|
@@ -63,7 +55,7 @@
|
|
| 63 |
},
|
| 64 |
{
|
| 65 |
"cell_type": "code",
|
| 66 |
-
"execution_count":
|
| 67 |
"id": "d925371b-8777-4f5b-a7f2-ec3f228ef266",
|
| 68 |
"metadata": {},
|
| 69 |
"outputs": [],
|
|
@@ -84,41 +76,64 @@
|
|
| 84 |
{
|
| 85 |
"cell_type": "code",
|
| 86 |
"execution_count": null,
|
| 87 |
-
"id": "
|
| 88 |
"metadata": {},
|
| 89 |
"outputs": [],
|
| 90 |
"source": []
|
| 91 |
},
|
| 92 |
{
|
| 93 |
"cell_type": "code",
|
| 94 |
-
"execution_count":
|
| 95 |
"id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb",
|
| 96 |
"metadata": {},
|
| 97 |
"outputs": [],
|
| 98 |
"source": [
|
| 99 |
"# load some documents\n",
|
| 100 |
-
"
|
| 101 |
-
"
|
| 102 |
-
"
|
| 103 |
-
"
|
| 104 |
-
"
|
| 105 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
]
|
| 107 |
},
|
| 108 |
{
|
| 109 |
"cell_type": "code",
|
| 110 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
"id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d",
|
| 112 |
"metadata": {},
|
| 113 |
"outputs": [],
|
| 114 |
"source": [
|
| 115 |
"# initialize client, setting path to save data\n",
|
| 116 |
-
"db = chromadb.PersistentClient(path=\"../models/
|
| 117 |
]
|
| 118 |
},
|
| 119 |
{
|
| 120 |
"cell_type": "code",
|
| 121 |
-
"execution_count":
|
| 122 |
"id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed",
|
| 123 |
"metadata": {},
|
| 124 |
"outputs": [],
|
|
@@ -129,7 +144,7 @@
|
|
| 129 |
},
|
| 130 |
{
|
| 131 |
"cell_type": "code",
|
| 132 |
-
"execution_count":
|
| 133 |
"id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672",
|
| 134 |
"metadata": {},
|
| 135 |
"outputs": [],
|
|
@@ -148,18 +163,10 @@
|
|
| 148 |
},
|
| 149 |
{
|
| 150 |
"cell_type": "code",
|
| 151 |
-
"execution_count":
|
| 152 |
"id": "0946b6ce-96ab-44de-ad75-e424a8429f67",
|
| 153 |
"metadata": {},
|
| 154 |
-
"outputs": [
|
| 155 |
-
{
|
| 156 |
-
"name": "stdout",
|
| 157 |
-
"output_type": "stream",
|
| 158 |
-
"text": [
|
| 159 |
-
"LLM is explicitly disabled. Using MockLLM.\n"
|
| 160 |
-
]
|
| 161 |
-
}
|
| 162 |
-
],
|
| 163 |
"source": [
|
| 164 |
"Settings.llm = None\n",
|
| 165 |
"Settings.chunk_size = 1024\n",
|
|
@@ -169,31 +176,20 @@
|
|
| 169 |
},
|
| 170 |
{
|
| 171 |
"cell_type": "code",
|
| 172 |
-
"execution_count":
|
| 173 |
"id": "b8c73a2c-1129-406a-8046-085afcaf9cbb",
|
| 174 |
"metadata": {},
|
| 175 |
"outputs": [],
|
| 176 |
"source": [
|
| 177 |
-
"nodes = Settings.node_parser.get_nodes_from_documents(
|
| 178 |
]
|
| 179 |
},
|
| 180 |
{
|
| 181 |
"cell_type": "code",
|
| 182 |
-
"execution_count":
|
| 183 |
"id": "75f1c76f-d3e5-4b69-818c-98865adb1457",
|
| 184 |
"metadata": {},
|
| 185 |
-
"outputs": [
|
| 186 |
-
{
|
| 187 |
-
"data": {
|
| 188 |
-
"text/plain": [
|
| 189 |
-
"6814"
|
| 190 |
-
]
|
| 191 |
-
},
|
| 192 |
-
"execution_count": 13,
|
| 193 |
-
"metadata": {},
|
| 194 |
-
"output_type": "execute_result"
|
| 195 |
-
}
|
| 196 |
-
],
|
| 197 |
"source": [
|
| 198 |
"len(nodes)"
|
| 199 |
]
|
|
@@ -208,7 +204,7 @@
|
|
| 208 |
},
|
| 209 |
{
|
| 210 |
"cell_type": "code",
|
| 211 |
-
"execution_count":
|
| 212 |
"id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4",
|
| 213 |
"metadata": {},
|
| 214 |
"outputs": [],
|
|
@@ -218,7 +214,7 @@
|
|
| 218 |
},
|
| 219 |
{
|
| 220 |
"cell_type": "code",
|
| 221 |
-
"execution_count":
|
| 222 |
"id": "6a764113-ad7e-4674-aa57-ebbf405902a8",
|
| 223 |
"metadata": {},
|
| 224 |
"outputs": [],
|
|
@@ -236,7 +232,7 @@
|
|
| 236 |
},
|
| 237 |
{
|
| 238 |
"cell_type": "code",
|
| 239 |
-
"execution_count":
|
| 240 |
"id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05",
|
| 241 |
"metadata": {},
|
| 242 |
"outputs": [],
|
|
@@ -246,7 +242,7 @@
|
|
| 246 |
},
|
| 247 |
{
|
| 248 |
"cell_type": "code",
|
| 249 |
-
"execution_count":
|
| 250 |
"id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb",
|
| 251 |
"metadata": {},
|
| 252 |
"outputs": [],
|
|
@@ -256,18 +252,10 @@
|
|
| 256 |
},
|
| 257 |
{
|
| 258 |
"cell_type": "code",
|
| 259 |
-
"execution_count":
|
| 260 |
"id": "d3bd848d-9985-4a3d-bdc4-ec340cc69ef3",
|
| 261 |
"metadata": {},
|
| 262 |
-
"outputs": [
|
| 263 |
-
{
|
| 264 |
-
"name": "stdout",
|
| 265 |
-
"output_type": "stream",
|
| 266 |
-
"text": [
|
| 267 |
-
"Indexing time: 2.3 mins\n"
|
| 268 |
-
]
|
| 269 |
-
}
|
| 270 |
-
],
|
| 271 |
"source": [
|
| 272 |
"indexing_cost = time.time() - start_time\n",
|
| 273 |
"indexing_cost = indexing_cost / 60\n",
|
|
@@ -276,7 +264,7 @@
|
|
| 276 |
},
|
| 277 |
{
|
| 278 |
"cell_type": "code",
|
| 279 |
-
"execution_count":
|
| 280 |
"id": "f16cca33-71fb-437d-a033-671b9fd44054",
|
| 281 |
"metadata": {},
|
| 282 |
"outputs": [],
|
|
@@ -286,28 +274,28 @@
|
|
| 286 |
},
|
| 287 |
{
|
| 288 |
"cell_type": "code",
|
| 289 |
-
"execution_count":
|
| 290 |
"id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e",
|
| 291 |
"metadata": {
|
| 292 |
"scrolled": true
|
| 293 |
},
|
| 294 |
-
"outputs": [
|
| 295 |
-
{
|
| 296 |
-
"data": {
|
| 297 |
-
"text/plain": [
|
| 298 |
-
"Response(response='Context information is below.\\n---------------------\\nfile_path: ../raw_documents/answers_temp/answers_050.txt\\n\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".\\n\\nfile_path: ../raw_documents/qna_temp/qna_050.txt\\n\\nC1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3’s M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. The answer is \"Individual Savings\".\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: Healthcare System in Singapore consists of?\\nAnswer: ', source_nodes=[NodeWithScore(node=TextNode(id_='536fef67-6a3f-4054-a94a-cc9143599510', embedding=None, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='2b0f7dad-c532-4abd-8c42-f53383a4fc76', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='5b1d1dc729a663e4ccfacc0f18adf0f6644a2a7d2991490fd962d1550c83f2ff'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='6d93c092-b4cc-4b5b-b379-080d777d3908', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '../raw_documents/answers_temp/answers_044.txt', 'file_name': 'answers_044.txt', 'file_type': 'text/plain', 'file_size': 164, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='caeb59043b8daa56ed472941882947570abff951f64aa0498672aba5921fac1d'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='859a9958-6f5d-4581-95d0-39edfc950ef5', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='8416454b2fbad3e6122c5151d2b3d1eadf0afde3514ba09374c71e96baf712bc')}, text='Question: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=130, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4159636550867191), NodeWithScore(node=TextNode(id_='472000ae-a0aa-4464-a200-72fe67a3fbde', embedding=None, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='506fb715-d3b0-4ca7-b7ca-011a1e1a1f0d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='7461ffa12ff6729003131976b82995b7254ab10f8dc7d79c65988ec9e3b7b631'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='d8232b90-d641-4966-b98f-4ca0821db773', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '../raw_documents/qna_temp/qna_044.txt', 'file_name': 'qna_044.txt', 'file_type': 'text/plain', 'file_size': 383, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='cbeb00c29c6130548466697a862fee43ab2be92d84158cc0b69c2f5c7bbe68b1'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='e772e623-cf91-41cd-a516-50acb894eb54', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='a7583b0fd46f98d0118c712632277d81f417b779f8bcc100ab2558dae6317cde')}, text='C1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3’s M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=295, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4126648577998099)], metadata={'536fef67-6a3f-4054-a94a-cc9143599510': {'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, '472000ae-a0aa-4464-a200-72fe67a3fbde': {'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}})"
|
| 299 |
-
]
|
| 300 |
-
},
|
| 301 |
-
"execution_count": 20,
|
| 302 |
-
"metadata": {},
|
| 303 |
-
"output_type": "execute_result"
|
| 304 |
-
}
|
| 305 |
-
],
|
| 306 |
"source": [
|
| 307 |
"response = vector_query_engine.query(\"Healthcare System in Singapore consists of?\")\n",
|
| 308 |
"response"
|
| 309 |
]
|
| 310 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
{
|
| 312 |
"cell_type": "code",
|
| 313 |
"execution_count": null,
|
|
@@ -318,7 +306,7 @@
|
|
| 318 |
},
|
| 319 |
{
|
| 320 |
"cell_type": "code",
|
| 321 |
-
"execution_count":
|
| 322 |
"id": "1bb75b04-6a62-43a4-8728-d2e52e49f1c0",
|
| 323 |
"metadata": {},
|
| 324 |
"outputs": [],
|
|
@@ -329,7 +317,7 @@
|
|
| 329 |
},
|
| 330 |
{
|
| 331 |
"cell_type": "code",
|
| 332 |
-
"execution_count":
|
| 333 |
"id": "0ed920fb-6456-49ac-8b63-08bd86b5b39c",
|
| 334 |
"metadata": {},
|
| 335 |
"outputs": [],
|
|
@@ -364,7 +352,7 @@
|
|
| 364 |
},
|
| 365 |
{
|
| 366 |
"cell_type": "code",
|
| 367 |
-
"execution_count":
|
| 368 |
"id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5",
|
| 369 |
"metadata": {},
|
| 370 |
"outputs": [],
|
|
@@ -381,7 +369,14 @@
|
|
| 381 |
"from llama_index.llms.openai import OpenAI\n",
|
| 382 |
"from llama_index.core.memory import ChatMemoryBuffer\n",
|
| 383 |
"\n",
|
| 384 |
-
"import time"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 385 |
]
|
| 386 |
},
|
| 387 |
{
|
|
@@ -394,7 +389,7 @@
|
|
| 394 |
},
|
| 395 |
{
|
| 396 |
"cell_type": "code",
|
| 397 |
-
"execution_count":
|
| 398 |
"id": "d38dc953-b923-4128-86a1-c8c6f69af0ed",
|
| 399 |
"metadata": {},
|
| 400 |
"outputs": [],
|
|
@@ -404,7 +399,7 @@
|
|
| 404 |
},
|
| 405 |
{
|
| 406 |
"cell_type": "code",
|
| 407 |
-
"execution_count":
|
| 408 |
"id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e",
|
| 409 |
"metadata": {},
|
| 410 |
"outputs": [],
|
|
@@ -414,7 +409,7 @@
|
|
| 414 |
},
|
| 415 |
{
|
| 416 |
"cell_type": "code",
|
| 417 |
-
"execution_count":
|
| 418 |
"id": "0583e9b0-d977-488c-8331-46dfa749924c",
|
| 419 |
"metadata": {},
|
| 420 |
"outputs": [],
|
|
@@ -433,17 +428,17 @@
|
|
| 433 |
},
|
| 434 |
{
|
| 435 |
"cell_type": "code",
|
| 436 |
-
"execution_count":
|
| 437 |
"id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba",
|
| 438 |
"metadata": {},
|
| 439 |
"outputs": [],
|
| 440 |
"source": [
|
| 441 |
-
"db = chromadb.PersistentClient(path=\"../models/
|
| 442 |
]
|
| 443 |
},
|
| 444 |
{
|
| 445 |
"cell_type": "code",
|
| 446 |
-
"execution_count":
|
| 447 |
"id": "1b385644-b46e-4d13-88fa-9f4af39db405",
|
| 448 |
"metadata": {},
|
| 449 |
"outputs": [],
|
|
@@ -453,7 +448,7 @@
|
|
| 453 |
},
|
| 454 |
{
|
| 455 |
"cell_type": "code",
|
| 456 |
-
"execution_count":
|
| 457 |
"id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2",
|
| 458 |
"metadata": {},
|
| 459 |
"outputs": [],
|
|
@@ -465,7 +460,7 @@
|
|
| 465 |
},
|
| 466 |
{
|
| 467 |
"cell_type": "code",
|
| 468 |
-
"execution_count":
|
| 469 |
"id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae",
|
| 470 |
"metadata": {},
|
| 471 |
"outputs": [],
|
|
@@ -487,20 +482,7 @@
|
|
| 487 |
},
|
| 488 |
{
|
| 489 |
"cell_type": "code",
|
| 490 |
-
"execution_count":
|
| 491 |
-
"id": "1a506940-c2b4-4d14-ad93-fd451331c582",
|
| 492 |
-
"metadata": {},
|
| 493 |
-
"outputs": [],
|
| 494 |
-
"source": [
|
| 495 |
-
"system_content = (\"You are a helpful study assistant. \"\n",
|
| 496 |
-
" \"You do not respond as 'User' or pretend to be 'User'. \"\n",
|
| 497 |
-
" \"You only respond once as 'Assistant'.\"\n",
|
| 498 |
-
")"
|
| 499 |
-
]
|
| 500 |
-
},
|
| 501 |
-
{
|
| 502 |
-
"cell_type": "code",
|
| 503 |
-
"execution_count": 10,
|
| 504 |
"id": "3f592848-8536-4b4d-b34a-adc32d043432",
|
| 505 |
"metadata": {},
|
| 506 |
"outputs": [],
|
|
@@ -510,7 +492,7 @@
|
|
| 510 |
},
|
| 511 |
{
|
| 512 |
"cell_type": "code",
|
| 513 |
-
"execution_count":
|
| 514 |
"id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252",
|
| 515 |
"metadata": {},
|
| 516 |
"outputs": [],
|
|
@@ -524,7 +506,7 @@
|
|
| 524 |
},
|
| 525 |
{
|
| 526 |
"cell_type": "code",
|
| 527 |
-
"execution_count":
|
| 528 |
"id": "c3106dff-dd6f-47a9-9454-1e61775e7539",
|
| 529 |
"metadata": {},
|
| 530 |
"outputs": [],
|
|
@@ -532,7 +514,7 @@
|
|
| 532 |
"hi_engine = index.as_query_engine(\n",
|
| 533 |
" memory=memory,\n",
|
| 534 |
" system_prompt=system_content,\n",
|
| 535 |
-
" similarity_top_k=
|
| 536 |
" streaming=True\n",
|
| 537 |
")"
|
| 538 |
]
|
|
@@ -547,7 +529,7 @@
|
|
| 547 |
},
|
| 548 |
{
|
| 549 |
"cell_type": "code",
|
| 550 |
-
"execution_count":
|
| 551 |
"id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612",
|
| 552 |
"metadata": {},
|
| 553 |
"outputs": [],
|
|
@@ -563,21 +545,14 @@
|
|
| 563 |
},
|
| 564 |
{
|
| 565 |
"cell_type": "code",
|
| 566 |
-
"execution_count":
|
| 567 |
"id": "a1c83dff-50d1-47b1-b7e9-4fc5cd08e1e8",
|
| 568 |
"metadata": {},
|
| 569 |
-
"outputs": [
|
| 570 |
-
{
|
| 571 |
-
"name": "stdout",
|
| 572 |
-
"output_type": "stream",
|
| 573 |
-
"text": [
|
| 574 |
-
"D. To provide for the care of employees\n"
|
| 575 |
-
]
|
| 576 |
-
}
|
| 577 |
-
],
|
| 578 |
"source": [
|
| 579 |
-
"
|
| 580 |
-
"
|
|
|
|
| 581 |
]
|
| 582 |
},
|
| 583 |
{
|
|
@@ -591,39 +566,34 @@
|
|
| 591 |
{
|
| 592 |
"cell_type": "code",
|
| 593 |
"execution_count": null,
|
| 594 |
-
"id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 595 |
"metadata": {},
|
| 596 |
"outputs": [],
|
| 597 |
"source": []
|
| 598 |
},
|
| 599 |
{
|
| 600 |
"cell_type": "code",
|
| 601 |
-
"execution_count":
|
| 602 |
-
"id": "
|
| 603 |
-
"metadata": {},
|
| 604 |
-
"outputs": [
|
| 605 |
-
|
| 606 |
-
"name": "stderr",
|
| 607 |
-
"output_type": "stream",
|
| 608 |
-
"text": [
|
| 609 |
-
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
| 610 |
-
"To disable this warning, you can either:\n",
|
| 611 |
-
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
| 612 |
-
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
| 613 |
-
]
|
| 614 |
-
},
|
| 615 |
-
{
|
| 616 |
-
"name": "stdout",
|
| 617 |
-
"output_type": "stream",
|
| 618 |
-
"text": [
|
| 619 |
-
"The correct answer is \"Deductibles apply for all treatments\".\n"
|
| 620 |
-
]
|
| 621 |
-
}
|
| 622 |
-
],
|
| 623 |
-
"source": [
|
| 624 |
-
"res = chat_engine.chat(prompt)\n",
|
| 625 |
-
"print(res.response)"
|
| 626 |
-
]
|
| 627 |
},
|
| 628 |
{
|
| 629 |
"cell_type": "code",
|
|
|
|
| 10 |
},
|
| 11 |
{
|
| 12 |
"cell_type": "code",
|
| 13 |
+
"execution_count": null,
|
| 14 |
"id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97",
|
| 15 |
"metadata": {},
|
| 16 |
"outputs": [],
|
|
|
|
| 33 |
"import nest_asyncio\n",
|
| 34 |
"nest_asyncio.apply()\n",
|
| 35 |
"\n",
|
| 36 |
+
"import time\n",
|
| 37 |
+
"import PyPDF2"
|
| 38 |
]
|
| 39 |
},
|
| 40 |
{
|
| 41 |
"cell_type": "code",
|
| 42 |
+
"execution_count": null,
|
| 43 |
"id": "978152ce-4d87-44b5-b521-dbaff60b32b0",
|
| 44 |
"metadata": {},
|
| 45 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
"source": [
|
| 47 |
"split_content(filepath=\"../raw_documents/answers.txt\", \n",
|
| 48 |
" separator=\"\\n\\n\", \n",
|
|
|
|
| 55 |
},
|
| 56 |
{
|
| 57 |
"cell_type": "code",
|
| 58 |
+
"execution_count": null,
|
| 59 |
"id": "d925371b-8777-4f5b-a7f2-ec3f228ef266",
|
| 60 |
"metadata": {},
|
| 61 |
"outputs": [],
|
|
|
|
| 76 |
{
|
| 77 |
"cell_type": "code",
|
| 78 |
"execution_count": null,
|
| 79 |
+
"id": "a83b4fd8-5075-4c52-820c-a3ac7ee7f0c8",
|
| 80 |
"metadata": {},
|
| 81 |
"outputs": [],
|
| 82 |
"source": []
|
| 83 |
},
|
| 84 |
{
|
| 85 |
"cell_type": "code",
|
| 86 |
+
"execution_count": null,
|
| 87 |
"id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb",
|
| 88 |
"metadata": {},
|
| 89 |
"outputs": [],
|
| 90 |
"source": [
|
| 91 |
"# load some documents\n",
|
| 92 |
+
"if False:\n",
|
| 93 |
+
" documents = SimpleDirectoryReader(input_files=[\n",
|
| 94 |
+
" \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
|
| 95 |
+
" \"../raw_documents/conversation_examples.txt\",\n",
|
| 96 |
+
" \"../raw_documents/HI_Knowledge_Base.pdf\",\n",
|
| 97 |
+
" ] + answers_temp_files + qna_temp_files ).load_data()\n",
|
| 98 |
+
"else:\n",
|
| 99 |
+
" reader_summary = PyPDF2.PdfReader(\"../raw_documents/HI Chapter Summary Version 1.3.pdf\")\n",
|
| 100 |
+
" documents_summary = [ p.extract_text() for p in reader_summary.pages ]\n",
|
| 101 |
+
"\n",
|
| 102 |
+
" reader_base = PyPDF2.PdfReader(\"../raw_documents/HI_Knowledge_Base.pdf\")\n",
|
| 103 |
+
" documents_base = [ p.extract_text() for p in reader_base.pages ]\n",
|
| 104 |
+
" \n",
|
| 105 |
+
" documents_txt = SimpleDirectoryReader(input_files=[\n",
|
| 106 |
+
" \"../raw_documents/conversation_examples.txt\",\n",
|
| 107 |
+
" \"../raw_documents/qna.txt\",\n",
|
| 108 |
+
" \"../raw_documents/answers.txt\"\n",
|
| 109 |
+
" ] ).load_data()\n",
|
| 110 |
+
" documents_txt = [doc.text for doc in documents_txt]\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"document = Document(text=\"\\n\\n\".join(documents_summary + documents_base + documents_txt))"
|
| 113 |
]
|
| 114 |
},
|
| 115 |
{
|
| 116 |
"cell_type": "code",
|
| 117 |
+
"execution_count": null,
|
| 118 |
+
"id": "e485f801-1829-4b50-b6b2-52803203853b",
|
| 119 |
+
"metadata": {},
|
| 120 |
+
"outputs": [],
|
| 121 |
+
"source": []
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "code",
|
| 125 |
+
"execution_count": null,
|
| 126 |
"id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d",
|
| 127 |
"metadata": {},
|
| 128 |
"outputs": [],
|
| 129 |
"source": [
|
| 130 |
"# initialize client, setting path to save data\n",
|
| 131 |
+
"db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced_corrected\")"
|
| 132 |
]
|
| 133 |
},
|
| 134 |
{
|
| 135 |
"cell_type": "code",
|
| 136 |
+
"execution_count": null,
|
| 137 |
"id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed",
|
| 138 |
"metadata": {},
|
| 139 |
"outputs": [],
|
|
|
|
| 144 |
},
|
| 145 |
{
|
| 146 |
"cell_type": "code",
|
| 147 |
+
"execution_count": null,
|
| 148 |
"id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672",
|
| 149 |
"metadata": {},
|
| 150 |
"outputs": [],
|
|
|
|
| 163 |
},
|
| 164 |
{
|
| 165 |
"cell_type": "code",
|
| 166 |
+
"execution_count": null,
|
| 167 |
"id": "0946b6ce-96ab-44de-ad75-e424a8429f67",
|
| 168 |
"metadata": {},
|
| 169 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
"source": [
|
| 171 |
"Settings.llm = None\n",
|
| 172 |
"Settings.chunk_size = 1024\n",
|
|
|
|
| 176 |
},
|
| 177 |
{
|
| 178 |
"cell_type": "code",
|
| 179 |
+
"execution_count": null,
|
| 180 |
"id": "b8c73a2c-1129-406a-8046-085afcaf9cbb",
|
| 181 |
"metadata": {},
|
| 182 |
"outputs": [],
|
| 183 |
"source": [
|
| 184 |
+
"nodes = Settings.node_parser.get_nodes_from_documents([document])"
|
| 185 |
]
|
| 186 |
},
|
| 187 |
{
|
| 188 |
"cell_type": "code",
|
| 189 |
+
"execution_count": null,
|
| 190 |
"id": "75f1c76f-d3e5-4b69-818c-98865adb1457",
|
| 191 |
"metadata": {},
|
| 192 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
"source": [
|
| 194 |
"len(nodes)"
|
| 195 |
]
|
|
|
|
| 204 |
},
|
| 205 |
{
|
| 206 |
"cell_type": "code",
|
| 207 |
+
"execution_count": null,
|
| 208 |
"id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4",
|
| 209 |
"metadata": {},
|
| 210 |
"outputs": [],
|
|
|
|
| 214 |
},
|
| 215 |
{
|
| 216 |
"cell_type": "code",
|
| 217 |
+
"execution_count": null,
|
| 218 |
"id": "6a764113-ad7e-4674-aa57-ebbf405902a8",
|
| 219 |
"metadata": {},
|
| 220 |
"outputs": [],
|
|
|
|
| 232 |
},
|
| 233 |
{
|
| 234 |
"cell_type": "code",
|
| 235 |
+
"execution_count": null,
|
| 236 |
"id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05",
|
| 237 |
"metadata": {},
|
| 238 |
"outputs": [],
|
|
|
|
| 242 |
},
|
| 243 |
{
|
| 244 |
"cell_type": "code",
|
| 245 |
+
"execution_count": null,
|
| 246 |
"id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb",
|
| 247 |
"metadata": {},
|
| 248 |
"outputs": [],
|
|
|
|
| 252 |
},
|
| 253 |
{
|
| 254 |
"cell_type": "code",
|
| 255 |
+
"execution_count": null,
|
| 256 |
"id": "d3bd848d-9985-4a3d-bdc4-ec340cc69ef3",
|
| 257 |
"metadata": {},
|
| 258 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
"source": [
|
| 260 |
"indexing_cost = time.time() - start_time\n",
|
| 261 |
"indexing_cost = indexing_cost / 60\n",
|
|
|
|
| 264 |
},
|
| 265 |
{
|
| 266 |
"cell_type": "code",
|
| 267 |
+
"execution_count": null,
|
| 268 |
"id": "f16cca33-71fb-437d-a033-671b9fd44054",
|
| 269 |
"metadata": {},
|
| 270 |
"outputs": [],
|
|
|
|
| 274 |
},
|
| 275 |
{
|
| 276 |
"cell_type": "code",
|
| 277 |
+
"execution_count": null,
|
| 278 |
"id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e",
|
| 279 |
"metadata": {
|
| 280 |
"scrolled": true
|
| 281 |
},
|
| 282 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
"source": [
|
| 284 |
"response = vector_query_engine.query(\"Healthcare System in Singapore consists of?\")\n",
|
| 285 |
"response"
|
| 286 |
]
|
| 287 |
},
|
| 288 |
+
{
|
| 289 |
+
"cell_type": "code",
|
| 290 |
+
"execution_count": null,
|
| 291 |
+
"id": "d83e2938-61fa-4d02-920d-0ae88a437abc",
|
| 292 |
+
"metadata": {},
|
| 293 |
+
"outputs": [],
|
| 294 |
+
"source": [
|
| 295 |
+
"response = vector_query_engine.query(\"what is integrated shield plan\")\n",
|
| 296 |
+
"response"
|
| 297 |
+
]
|
| 298 |
+
},
|
| 299 |
{
|
| 300 |
"cell_type": "code",
|
| 301 |
"execution_count": null,
|
|
|
|
| 306 |
},
|
| 307 |
{
|
| 308 |
"cell_type": "code",
|
| 309 |
+
"execution_count": null,
|
| 310 |
"id": "1bb75b04-6a62-43a4-8728-d2e52e49f1c0",
|
| 311 |
"metadata": {},
|
| 312 |
"outputs": [],
|
|
|
|
| 317 |
},
|
| 318 |
{
|
| 319 |
"cell_type": "code",
|
| 320 |
+
"execution_count": null,
|
| 321 |
"id": "0ed920fb-6456-49ac-8b63-08bd86b5b39c",
|
| 322 |
"metadata": {},
|
| 323 |
"outputs": [],
|
|
|
|
| 352 |
},
|
| 353 |
{
|
| 354 |
"cell_type": "code",
|
| 355 |
+
"execution_count": null,
|
| 356 |
"id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5",
|
| 357 |
"metadata": {},
|
| 358 |
"outputs": [],
|
|
|
|
| 369 |
"from llama_index.llms.openai import OpenAI\n",
|
| 370 |
"from llama_index.core.memory import ChatMemoryBuffer\n",
|
| 371 |
"\n",
|
| 372 |
+
"import time\n",
|
| 373 |
+
"\n",
|
| 374 |
+
"from prompt_engineering import (\n",
|
| 375 |
+
" system_content, \n",
|
| 376 |
+
" textbook_content, \n",
|
| 377 |
+
" winnie_the_pooh_prompt, \n",
|
| 378 |
+
" introduction_line\n",
|
| 379 |
+
")"
|
| 380 |
]
|
| 381 |
},
|
| 382 |
{
|
|
|
|
| 389 |
},
|
| 390 |
{
|
| 391 |
"cell_type": "code",
|
| 392 |
+
"execution_count": null,
|
| 393 |
"id": "d38dc953-b923-4128-86a1-c8c6f69af0ed",
|
| 394 |
"metadata": {},
|
| 395 |
"outputs": [],
|
|
|
|
| 399 |
},
|
| 400 |
{
|
| 401 |
"cell_type": "code",
|
| 402 |
+
"execution_count": null,
|
| 403 |
"id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e",
|
| 404 |
"metadata": {},
|
| 405 |
"outputs": [],
|
|
|
|
| 409 |
},
|
| 410 |
{
|
| 411 |
"cell_type": "code",
|
| 412 |
+
"execution_count": null,
|
| 413 |
"id": "0583e9b0-d977-488c-8331-46dfa749924c",
|
| 414 |
"metadata": {},
|
| 415 |
"outputs": [],
|
|
|
|
| 428 |
},
|
| 429 |
{
|
| 430 |
"cell_type": "code",
|
| 431 |
+
"execution_count": null,
|
| 432 |
"id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba",
|
| 433 |
"metadata": {},
|
| 434 |
"outputs": [],
|
| 435 |
"source": [
|
| 436 |
+
"db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced_corrected\")"
|
| 437 |
]
|
| 438 |
},
|
| 439 |
{
|
| 440 |
"cell_type": "code",
|
| 441 |
+
"execution_count": null,
|
| 442 |
"id": "1b385644-b46e-4d13-88fa-9f4af39db405",
|
| 443 |
"metadata": {},
|
| 444 |
"outputs": [],
|
|
|
|
| 448 |
},
|
| 449 |
{
|
| 450 |
"cell_type": "code",
|
| 451 |
+
"execution_count": null,
|
| 452 |
"id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2",
|
| 453 |
"metadata": {},
|
| 454 |
"outputs": [],
|
|
|
|
| 460 |
},
|
| 461 |
{
|
| 462 |
"cell_type": "code",
|
| 463 |
+
"execution_count": null,
|
| 464 |
"id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae",
|
| 465 |
"metadata": {},
|
| 466 |
"outputs": [],
|
|
|
|
| 482 |
},
|
| 483 |
{
|
| 484 |
"cell_type": "code",
|
| 485 |
+
"execution_count": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
"id": "3f592848-8536-4b4d-b34a-adc32d043432",
|
| 487 |
"metadata": {},
|
| 488 |
"outputs": [],
|
|
|
|
| 492 |
},
|
| 493 |
{
|
| 494 |
"cell_type": "code",
|
| 495 |
+
"execution_count": null,
|
| 496 |
"id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252",
|
| 497 |
"metadata": {},
|
| 498 |
"outputs": [],
|
|
|
|
| 506 |
},
|
| 507 |
{
|
| 508 |
"cell_type": "code",
|
| 509 |
+
"execution_count": null,
|
| 510 |
"id": "c3106dff-dd6f-47a9-9454-1e61775e7539",
|
| 511 |
"metadata": {},
|
| 512 |
"outputs": [],
|
|
|
|
| 514 |
"hi_engine = index.as_query_engine(\n",
|
| 515 |
" memory=memory,\n",
|
| 516 |
" system_prompt=system_content,\n",
|
| 517 |
+
" similarity_top_k=20,\n",
|
| 518 |
" streaming=True\n",
|
| 519 |
")"
|
| 520 |
]
|
|
|
|
| 529 |
},
|
| 530 |
{
|
| 531 |
"cell_type": "code",
|
| 532 |
+
"execution_count": null,
|
| 533 |
"id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612",
|
| 534 |
"metadata": {},
|
| 535 |
"outputs": [],
|
|
|
|
| 545 |
},
|
| 546 |
{
|
| 547 |
"cell_type": "code",
|
| 548 |
+
"execution_count": null,
|
| 549 |
"id": "a1c83dff-50d1-47b1-b7e9-4fc5cd08e1e8",
|
| 550 |
"metadata": {},
|
| 551 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
"source": [
|
| 553 |
+
"response = hi_engine.query(prompt)\n",
|
| 554 |
+
"for res in response.response_gen:\n",
|
| 555 |
+
" print(res, end=\"\")"
|
| 556 |
]
|
| 557 |
},
|
| 558 |
{
|
|
|
|
| 566 |
{
|
| 567 |
"cell_type": "code",
|
| 568 |
"execution_count": null,
|
| 569 |
+
"id": "91821a22-c1c4-46a6-90f0-c00651afb0f6",
|
| 570 |
+
"metadata": {},
|
| 571 |
+
"outputs": [],
|
| 572 |
+
"source": [
|
| 573 |
+
"# query_string = \"tell me more about integrated shield plans\"\n",
|
| 574 |
+
"# query_string = \"how to use CPF\"\n",
|
| 575 |
+
"query_string = \"what is MediSave\"\n",
|
| 576 |
+
"\n",
|
| 577 |
+
"response = hi_engine.query(query_string)\n",
|
| 578 |
+
"for res in response.response_gen:\n",
|
| 579 |
+
" print(res, end=\"\")"
|
| 580 |
+
]
|
| 581 |
+
},
|
| 582 |
+
{
|
| 583 |
+
"cell_type": "code",
|
| 584 |
+
"execution_count": null,
|
| 585 |
+
"id": "07969feb-2667-4d7d-a769-953082138988",
|
| 586 |
"metadata": {},
|
| 587 |
"outputs": [],
|
| 588 |
"source": []
|
| 589 |
},
|
| 590 |
{
|
| 591 |
"cell_type": "code",
|
| 592 |
+
"execution_count": null,
|
| 593 |
+
"id": "ec53dfcf-d4c0-4d10-a24e-be2004a83656",
|
| 594 |
+
"metadata": {},
|
| 595 |
+
"outputs": [],
|
| 596 |
+
"source": []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 597 |
},
|
| 598 |
{
|
| 599 |
"cell_type": "code",
|
notebooks/007_test_hi_content_engine.ipynb
CHANGED
|
@@ -34,6 +34,12 @@
|
|
| 34 |
"\n",
|
| 35 |
"from vision_api import get_transcribed_text\n",
|
| 36 |
"from qna_prompting import get_qna_question_tool, evaluate_qna_answer_tool\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
"\n",
|
| 38 |
"import nest_asyncio\n",
|
| 39 |
"nest_asyncio.apply()"
|
|
@@ -106,18 +112,20 @@
|
|
| 106 |
"\n",
|
| 107 |
" index = VectorStoreIndex(nodes, storage_context=storage_context)\n",
|
| 108 |
" \n",
|
| 109 |
-
" memory = ChatMemoryBuffer.from_defaults(token_limit=
|
| 110 |
" hi_content_engine = index.as_query_engine(\n",
|
| 111 |
" memory=memory,\n",
|
| 112 |
" system_prompt=system_content,\n",
|
| 113 |
-
" similarity_top_k=
|
|
|
|
| 114 |
" streaming=True\n",
|
| 115 |
" )\n",
|
| 116 |
" hi_textbook_query_description = \"\"\"\n",
|
| 117 |
-
" Use this tool to extract content from
|
|
|
|
| 118 |
" that has 15 chapters in total. When user wants to learn more about a \n",
|
| 119 |
" particular chapter, this tool will help to assist user to get better\n",
|
| 120 |
-
" understanding of the content of the textbook
|
| 121 |
" \"\"\"\n",
|
| 122 |
" \n",
|
| 123 |
" hi_query_tool = QueryEngineTool.from_defaults(\n",
|
|
@@ -195,32 +203,10 @@
|
|
| 195 |
"input_files = [\"./raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
|
| 196 |
" \"./raw_documents/qna.txt\"]\n",
|
| 197 |
"embedding_model = \"BAAI/bge-small-en-v1.5\"\n",
|
| 198 |
-
"persisted_vector_db = \"../models/
|
| 199 |
-
"
|
| 200 |
-
"
|
| 201 |
-
"
|
| 202 |
-
" \"You do not respond as 'User' or pretend to be 'User'. \"\n",
|
| 203 |
-
" \"You only respond once as 'Assistant'.\"\n",
|
| 204 |
-
")\n",
|
| 205 |
-
"textbook_content = (\n",
|
| 206 |
-
" \"The content of the textbook `Health Insurance 7th Edition` are as follows,\"\n",
|
| 207 |
-
" \"- Chapter 1: Overview Of Healthcare Environment In Singapore\"\n",
|
| 208 |
-
" \"- Chapter 2: Medical Expense Insurance\"\n",
|
| 209 |
-
" \"- Chapter 3: Group Medical Expense Insurance\"\n",
|
| 210 |
-
" \"- Chapter 4: Disability Income Insurance\"\n",
|
| 211 |
-
" \"- Chapter 5: Long-Term Care Insurance \"\n",
|
| 212 |
-
" \"- Chapter 6: Critical Illness Insurance\"\n",
|
| 213 |
-
" \"- Chapter 7: Other Types Of Health Insurance\"\n",
|
| 214 |
-
" \"- Chapter 8: Managed Healthcare\"\n",
|
| 215 |
-
" \"- Chapter 9: Part I Healthcare Financing\"\n",
|
| 216 |
-
" \"- Chapter 9: Part II Healthcare Financing\"\n",
|
| 217 |
-
" \"- Chapter 10: Common Policy Provisions\"\n",
|
| 218 |
-
" \"- Chapter 11: Health Insurance Pricing\"\n",
|
| 219 |
-
" \"- Chapter 12: Health Insurance Underwriting\"\n",
|
| 220 |
-
" \"- Chapter 13: Notice No: MAS 120 Disclosure And Advisory Process - Requirements For Accident And Health Insurance Products\"\n",
|
| 221 |
-
" \"- Chapter 14: Financial Needs Analysis\"\n",
|
| 222 |
-
" \"- Chapter 15: Case Studies\"\n",
|
| 223 |
-
")"
|
| 224 |
]
|
| 225 |
},
|
| 226 |
{
|
|
@@ -292,6 +278,14 @@
|
|
| 292 |
")"
|
| 293 |
]
|
| 294 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
{
|
| 296 |
"cell_type": "code",
|
| 297 |
"execution_count": null,
|
|
@@ -338,36 +332,39 @@
|
|
| 338 |
{
|
| 339 |
"cell_type": "code",
|
| 340 |
"execution_count": null,
|
| 341 |
-
"id": "
|
| 342 |
"metadata": {},
|
| 343 |
"outputs": [],
|
| 344 |
"source": [
|
| 345 |
-
"
|
|
|
|
|
|
|
| 346 |
]
|
| 347 |
},
|
| 348 |
{
|
| 349 |
"cell_type": "code",
|
| 350 |
"execution_count": null,
|
| 351 |
-
"id": "
|
| 352 |
"metadata": {},
|
| 353 |
"outputs": [],
|
| 354 |
-
"source": [
|
| 355 |
-
"for res in response.response_gen:\n",
|
| 356 |
-
" print(res, end=\"\")"
|
| 357 |
-
]
|
| 358 |
},
|
| 359 |
{
|
| 360 |
"cell_type": "code",
|
| 361 |
"execution_count": null,
|
| 362 |
-
"id": "
|
| 363 |
"metadata": {},
|
| 364 |
"outputs": [],
|
| 365 |
-
"source": [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
},
|
| 367 |
{
|
| 368 |
"cell_type": "code",
|
| 369 |
"execution_count": null,
|
| 370 |
-
"id": "
|
| 371 |
"metadata": {},
|
| 372 |
"outputs": [],
|
| 373 |
"source": []
|
|
@@ -379,7 +376,9 @@
|
|
| 379 |
"metadata": {},
|
| 380 |
"outputs": [],
|
| 381 |
"source": [
|
| 382 |
-
"response = agent.stream_chat(
|
|
|
|
|
|
|
| 383 |
]
|
| 384 |
},
|
| 385 |
{
|
|
@@ -388,10 +387,7 @@
|
|
| 388 |
"id": "eff8bb8d-a2d1-428a-9c3d-193389378288",
|
| 389 |
"metadata": {},
|
| 390 |
"outputs": [],
|
| 391 |
-
"source": [
|
| 392 |
-
"for res in response.response_gen:\n",
|
| 393 |
-
" print(res, end=\"\")"
|
| 394 |
-
]
|
| 395 |
},
|
| 396 |
{
|
| 397 |
"cell_type": "code",
|
|
|
|
| 34 |
"\n",
|
| 35 |
"from vision_api import get_transcribed_text\n",
|
| 36 |
"from qna_prompting import get_qna_question_tool, evaluate_qna_answer_tool\n",
|
| 37 |
+
"from prompt_engineering import (\n",
|
| 38 |
+
" system_content, \n",
|
| 39 |
+
" textbook_content, \n",
|
| 40 |
+
" winnie_the_pooh_prompt, \n",
|
| 41 |
+
" introduction_line\n",
|
| 42 |
+
")\n",
|
| 43 |
"\n",
|
| 44 |
"import nest_asyncio\n",
|
| 45 |
"nest_asyncio.apply()"
|
|
|
|
| 112 |
"\n",
|
| 113 |
" index = VectorStoreIndex(nodes, storage_context=storage_context)\n",
|
| 114 |
" \n",
|
| 115 |
+
" memory = ChatMemoryBuffer.from_defaults(token_limit=100_000)\n",
|
| 116 |
" hi_content_engine = index.as_query_engine(\n",
|
| 117 |
" memory=memory,\n",
|
| 118 |
" system_prompt=system_content,\n",
|
| 119 |
+
" similarity_top_k=10,\n",
|
| 120 |
+
" verbose=True,\n",
|
| 121 |
" streaming=True\n",
|
| 122 |
" )\n",
|
| 123 |
" hi_textbook_query_description = \"\"\"\n",
|
| 124 |
+
" Use this tool to extract content from the query engine,\n",
|
| 125 |
+
" which is built by ingesting textbook content from `Health Insurance 7th Edition`,\n",
|
| 126 |
" that has 15 chapters in total. When user wants to learn more about a \n",
|
| 127 |
" particular chapter, this tool will help to assist user to get better\n",
|
| 128 |
+
" understanding of the content of the textbook. \n",
|
| 129 |
" \"\"\"\n",
|
| 130 |
" \n",
|
| 131 |
" hi_query_tool = QueryEngineTool.from_defaults(\n",
|
|
|
|
| 203 |
"input_files = [\"./raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
|
| 204 |
" \"./raw_documents/qna.txt\"]\n",
|
| 205 |
"embedding_model = \"BAAI/bge-small-en-v1.5\"\n",
|
| 206 |
+
"persisted_vector_db = \"../models/chroma_db_advanced_corrected\"\n",
|
| 207 |
+
"\n",
|
| 208 |
+
"# fine_tuned_path = \"local:../models/fine-tuned-embeddings\"\n",
|
| 209 |
+
"fine_tuned_path = \"local:../models/fine-tuned-embeddings-advanced\""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
]
|
| 211 |
},
|
| 212 |
{
|
|
|
|
| 278 |
")"
|
| 279 |
]
|
| 280 |
},
|
| 281 |
+
{
|
| 282 |
+
"cell_type": "code",
|
| 283 |
+
"execution_count": null,
|
| 284 |
+
"id": "a49ed30e-a631-4618-a79e-adab02114d8d",
|
| 285 |
+
"metadata": {},
|
| 286 |
+
"outputs": [],
|
| 287 |
+
"source": []
|
| 288 |
+
},
|
| 289 |
{
|
| 290 |
"cell_type": "code",
|
| 291 |
"execution_count": null,
|
|
|
|
| 332 |
{
|
| 333 |
"cell_type": "code",
|
| 334 |
"execution_count": null,
|
| 335 |
+
"id": "66c8881d-fc57-4e95-ad86-110c4818e2fe",
|
| 336 |
"metadata": {},
|
| 337 |
"outputs": [],
|
| 338 |
"source": [
|
| 339 |
+
"# query_string = \"tell me more about integrated shield plans\"\n",
|
| 340 |
+
"query_string = \"how to use CPF\"\n",
|
| 341 |
+
"# query_string = \"what is MediSave\""
|
| 342 |
]
|
| 343 |
},
|
| 344 |
{
|
| 345 |
"cell_type": "code",
|
| 346 |
"execution_count": null,
|
| 347 |
+
"id": "9cbd338b-bee5-4c06-9934-a0e27fd518d3",
|
| 348 |
"metadata": {},
|
| 349 |
"outputs": [],
|
| 350 |
+
"source": []
|
|
|
|
|
|
|
|
|
|
| 351 |
},
|
| 352 |
{
|
| 353 |
"cell_type": "code",
|
| 354 |
"execution_count": null,
|
| 355 |
+
"id": "5902ffd2-2f66-4b89-bf7f-a05e3fdeccaa",
|
| 356 |
"metadata": {},
|
| 357 |
"outputs": [],
|
| 358 |
+
"source": [
|
| 359 |
+
"response = hi_content_engine.query(query_string)\n",
|
| 360 |
+
"for res in response.response_gen:\n",
|
| 361 |
+
" print(res, end=\"\")"
|
| 362 |
+
]
|
| 363 |
},
|
| 364 |
{
|
| 365 |
"cell_type": "code",
|
| 366 |
"execution_count": null,
|
| 367 |
+
"id": "0e75453b-85c7-4e1c-8683-6df45a13cacb",
|
| 368 |
"metadata": {},
|
| 369 |
"outputs": [],
|
| 370 |
"source": []
|
|
|
|
| 376 |
"metadata": {},
|
| 377 |
"outputs": [],
|
| 378 |
"source": [
|
| 379 |
+
"response = agent.stream_chat(query_string, tool_choice=\"auto\")\n",
|
| 380 |
+
"for res in response.response_gen:\n",
|
| 381 |
+
" print(res, end=\"\")"
|
| 382 |
]
|
| 383 |
},
|
| 384 |
{
|
|
|
|
| 387 |
"id": "eff8bb8d-a2d1-428a-9c3d-193389378288",
|
| 388 |
"metadata": {},
|
| 389 |
"outputs": [],
|
| 390 |
+
"source": []
|
|
|
|
|
|
|
|
|
|
| 391 |
},
|
| 392 |
{
|
| 393 |
"cell_type": "code",
|
requirements.txt
CHANGED
|
@@ -185,6 +185,7 @@ PyMuPDF==1.23.22
|
|
| 185 |
PyMuPDFb==1.23.22
|
| 186 |
pyparsing==3.1.1
|
| 187 |
pypdf==4.0.1
|
|
|
|
| 188 |
PyPika==0.48.9
|
| 189 |
pyproject_hooks==1.0.0
|
| 190 |
python-dateutil==2.8.2
|
|
|
|
| 185 |
PyMuPDFb==1.23.22
|
| 186 |
pyparsing==3.1.1
|
| 187 |
pypdf==4.0.1
|
| 188 |
+
PyPDF2==3.0.1
|
| 189 |
PyPika==0.48.9
|
| 190 |
pyproject_hooks==1.0.0
|
| 191 |
python-dateutil==2.8.2
|