File size: 6,006 Bytes
5eaaba5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "50fa1916-c45f-4e79-aacd-8fd2956292b9",
   "metadata": {},
   "source": [
    "# Table Question Answering Pipeline\n",
    "\n",
    "Experimenting with a TableQuestionAnsweringPipeline.\n",
    "\n",
    "This is not really a good use of the model but it kind of works."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c4ca4235-042e-4b30-9a95-73f85c5f64ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "87419b8e-c1bb-42e4-b15a-e7d8c2b7367a",
   "metadata": {},
   "outputs": [],
   "source": [
    "faker_mappings = {\n",
    "    # Personal information\n",
    "      \"name\": [\n",
    "            \"full name\", \"name\", \"name of user\", \"person name\", \"name of person\", \n",
    "            \"complete name\", \"user name\", \"customer name\", \"client name\"\n",
    "        ],\n",
    "        \"first_name\": [\n",
    "            \"first name\", \"user's first name\", \"first name of a person\", \"person first name\", \n",
    "            \"given name\", \"forename\", \"christian name\", '', ''\n",
    "        ],\n",
    "        \"last_name\": [\n",
    "            \"last name\", \"surname\", \"family name\", \"user's last name\", \n",
    "            \"last name of user\", \"person's surname\", '', '', ''\n",
    "        ],\n",
    "        \"password\": [\n",
    "            \"user password\", \"person password\", \"member password\", \"secret password\", \n",
    "            \"confidential password\", \"example password\", '', '', '' # all arrays have to be equal. \n",
    "        ]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1d2919a4-2312-4e8c-a1d3-66c7042b2400",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Device set to use mps:0\n"
     ]
    }
   ],
   "source": [
    "pipe = pipeline(model=\"google/tapas-base-finetuned-wtq\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0a7ca742-273c-4530-a591-3012fe31cc09",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/envs/llms/lib/python3.11/site-packages/transformers/models/tapas/tokenization_tapas.py:2699: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
      "  text = normalize_for_match(row[col_index].text)\n",
      "/opt/anaconda3/envs/llms/lib/python3.11/site-packages/transformers/models/tapas/tokenization_tapas.py:1493: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
      "  cell = row[col_index]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'answer': 'first name', 'coordinates': [(0, 1)], 'cells': ['first name'], 'aggregator': 'NONE'}, {'answer': 'full name', 'coordinates': [(0, 0)], 'cells': ['full name'], 'aggregator': 'NONE'}, {'answer': 'secret password, confidential password', 'coordinates': [(3, 3), (4, 3)], 'cells': ['secret password', 'confidential password'], 'aggregator': 'NONE'}, {'answer': 'full name', 'coordinates': [(0, 0)], 'cells': ['full name'], 'aggregator': 'NONE'}]\n"
     ]
    }
   ],
   "source": [
    "result = pipe(query=['user first name', 'full name', \"secret password\", \"unique id\"], table=faker_mappings)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "803584e5-e3e1-48df-b974-712f7c5e81f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A function to lookup the faker function names based on coordinates\n",
    "def lookup_keys_from_results(result, faker_mappings):\n",
    "    reverse_mapping = {}\n",
    "    for key, values in faker_mappings.items():\n",
    "        for value in values:\n",
    "            if value:\n",
    "                reverse_mapping[value.lower()] = key\n",
    "    \n",
    "    coordinate_to_key = {}\n",
    "    for item in result:\n",
    "        answer = item['answer'].lower()\n",
    "        \n",
    "        if ',' in answer:\n",
    "            answers = [a.strip() for a in answer.split(',')]\n",
    "            for a in answers:\n",
    "                if a in reverse_mapping:\n",
    "                    for coord in item['coordinates']:\n",
    "                        coordinate_to_key[coord] = reverse_mapping[a]\n",
    "                    break\n",
    "        else:\n",
    "            if answer in reverse_mapping:\n",
    "                for coord in item['coordinates']:\n",
    "                    coordinate_to_key[coord] = reverse_mapping[answer]\n",
    "    \n",
    "    return coordinate_to_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2025a892-e3e5-4efe-8822-2cbbd13f903c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{(0, 1): 'first_name', (0, 0): 'name', (3, 3): 'password', (4, 3): 'password'}\n"
     ]
    }
   ],
   "source": [
    "print(lookup_keys_from_results(result, faker_mappings))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}