Spaces:
Sleeping
Sleeping
File size: 6,006 Bytes
5eaaba5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
{
"cells": [
{
"cell_type": "markdown",
"id": "50fa1916-c45f-4e79-aacd-8fd2956292b9",
"metadata": {},
"source": [
"# Table Question Answering Pipeline\n",
"\n",
"Experimenting with a TableQuestionAnsweringPipeline.\n",
"\n",
"This is not really a good use of the model but it kind of works."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c4ca4235-042e-4b30-9a95-73f85c5f64ea",
"metadata": {},
"outputs": [],
"source": [
"from transformers import pipeline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "87419b8e-c1bb-42e4-b15a-e7d8c2b7367a",
"metadata": {},
"outputs": [],
"source": [
"faker_mappings = {\n",
" # Personal information\n",
" \"name\": [\n",
" \"full name\", \"name\", \"name of user\", \"person name\", \"name of person\", \n",
" \"complete name\", \"user name\", \"customer name\", \"client name\"\n",
" ],\n",
" \"first_name\": [\n",
" \"first name\", \"user's first name\", \"first name of a person\", \"person first name\", \n",
" \"given name\", \"forename\", \"christian name\", '', ''\n",
" ],\n",
" \"last_name\": [\n",
" \"last name\", \"surname\", \"family name\", \"user's last name\", \n",
" \"last name of user\", \"person's surname\", '', '', ''\n",
" ],\n",
" \"password\": [\n",
" \"user password\", \"person password\", \"member password\", \"secret password\", \n",
" \"confidential password\", \"example password\", '', '', '' # all arrays have to be equal. \n",
" ]\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1d2919a4-2312-4e8c-a1d3-66c7042b2400",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Device set to use mps:0\n"
]
}
],
"source": [
"pipe = pipeline(model=\"google/tapas-base-finetuned-wtq\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0a7ca742-273c-4530-a591-3012fe31cc09",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/llms/lib/python3.11/site-packages/transformers/models/tapas/tokenization_tapas.py:2699: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" text = normalize_for_match(row[col_index].text)\n",
"/opt/anaconda3/envs/llms/lib/python3.11/site-packages/transformers/models/tapas/tokenization_tapas.py:1493: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" cell = row[col_index]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'answer': 'first name', 'coordinates': [(0, 1)], 'cells': ['first name'], 'aggregator': 'NONE'}, {'answer': 'full name', 'coordinates': [(0, 0)], 'cells': ['full name'], 'aggregator': 'NONE'}, {'answer': 'secret password, confidential password', 'coordinates': [(3, 3), (4, 3)], 'cells': ['secret password', 'confidential password'], 'aggregator': 'NONE'}, {'answer': 'full name', 'coordinates': [(0, 0)], 'cells': ['full name'], 'aggregator': 'NONE'}]\n"
]
}
],
"source": [
"result = pipe(query=['user first name', 'full name', \"secret password\", \"unique id\"], table=faker_mappings)\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "803584e5-e3e1-48df-b974-712f7c5e81f5",
"metadata": {},
"outputs": [],
"source": [
"# A function to lookup the faker function names based on coordinates\n",
"def lookup_keys_from_results(result, faker_mappings):\n",
" reverse_mapping = {}\n",
" for key, values in faker_mappings.items():\n",
" for value in values:\n",
" if value:\n",
" reverse_mapping[value.lower()] = key\n",
" \n",
" coordinate_to_key = {}\n",
" for item in result:\n",
" answer = item['answer'].lower()\n",
" \n",
" if ',' in answer:\n",
" answers = [a.strip() for a in answer.split(',')]\n",
" for a in answers:\n",
" if a in reverse_mapping:\n",
" for coord in item['coordinates']:\n",
" coordinate_to_key[coord] = reverse_mapping[a]\n",
" break\n",
" else:\n",
" if answer in reverse_mapping:\n",
" for coord in item['coordinates']:\n",
" coordinate_to_key[coord] = reverse_mapping[answer]\n",
" \n",
" return coordinate_to_key"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2025a892-e3e5-4efe-8822-2cbbd13f903c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{(0, 1): 'first_name', (0, 0): 'name', (3, 3): 'password', (4, 3): 'password'}\n"
]
}
],
"source": [
"print(lookup_keys_from_results(result, faker_mappings))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|