{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6fdb2a6f",
   "metadata": {},
   "source": [
    "## INFERENCE\n",
    "\n",
    "Load the `SequenceTagger` model, tag the text of a sample file, list the statistical tests reported in it, and recompute each test's p-value."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a9f7c9cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "from flair.models import SequenceTagger\n",
    "from flair.data import Sentence\n",
    "\n",
    "from tokenizer import StatsTokenizer\n",
    "from hypothesis import HypothesisTest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a391d4d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/vinay/opt/anaconda3/envs/lab/lib/python3.9/site-packages/huggingface_hub/file_download.py:629: FutureWarning: `cached_download` is the legacy way to download files from the HF hub, please consider upgrading to `hf_hub_download`\n",
      " warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2023-04-20 14:07:50,024 loading file /Users/vinay/.flair/models/stats-ner/36efff182e4649faa516d32ec20e0e565b874997ed6ee16de9cf7f4009a56ee3.09a8678c0f280ee4b018f8d418135a2f149e0ce74d4f61859f150e53b022dd29\n",
      "2023-04-20 14:07:50,368 SequenceTagger predicts: Dictionary with 11 tags: O, S-T, B-T, E-T, I-T, S-P, B-P, E-P, I-P, , \n"
     ]
    }
   ],
   "source": [
    "# Load the tagger by its Hugging Face model id.\n",
    "# NOTE(review): the saved output of this cell shows the model file being loaded from a\n",
    "# cache folder named `stats-ner`, while the id below is `stats-nerd` - confirm the id.\n",
    "model = SequenceTagger.load(\"VinayNR/stats-nerd\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "d4e55cb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the sample text that the tagger will be run on.\n",
    "file_path = \"../sample.txt\"\n",
    "# Explicit encoding so the read does not depend on the platform default\n",
    "# (assumes the sample file is UTF-8 - adjust if it is not).\n",
    "with open(file_path, encoding=\"utf-8\") as f:\n",
    "    fileStr = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "fdb6df57",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction begin\n",
      "Prediction done\n"
     ]
    }
   ],
   "source": [
    "# Tag the whole file as one Sentence; newlines are replaced by spaces first.\n",
    "sentence = Sentence(fileStr.replace('\\n', ' '), use_tokenizer=StatsTokenizer())\n",
    "print(\"Prediction begin\")\n",
    "# The labels are read back from `sentence` in the cells below, so the return\n",
    "# value of predict() is not kept.\n",
    "model.predict(sentence)\n",
    "print(\"Prediction done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "847cabce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the statistical tests reported in the tagged sentence.\n",
    "reported_tests = HypothesisTest.get_reported_stat_tests(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "b38b8ab6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reported Statistics in the text above....\n",
      "--------------------\n",
      "Test Type : t | Test Stat : 1.45 | DF : 23 | Rep P-val : <0.01\n",
      "Calculated p-value : 0.08027960035102566\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Show each reported test next to the p-value recomputed from it, so the\n",
    "# reported and calculated values can be compared by eye.\n",
    "print('Reported Statistics in the text above....')\n",
    "print('--------------------')\n",
    "for test in reported_tests:\n",
    "    print(test)\n",
    "    print('Calculated p-value : ', test.calculate_p_val())\n",
    "    print('\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "93b24840",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Span[1:9]: \"t ( df = 23 ) = 1.45\" → T (0.9833)\n",
      "Span[14:17]: \"p < 0.01\" → P (0.8529)\n"
     ]
    }
   ],
   "source": [
    "# List the labels the tagger attached to the sentence under the 'ner' label type.\n",
    "for entity in sentence.get_labels('ner'):\n",
    "    print(entity)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}