import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tf_keras as keras
import pandas as pd
from tensorflow.keras.models import load_model
import classifier_data_lib
import tokenization
import joblib
from deep_translator import GoogleTranslator
import sys
import os
import gradio as gr
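# Load the fine-tuned ISCO classifier and the pre-trained BERT layer; in this script the
# BERT layer is only used to recover the vocabulary file and casing setting for the tokenizer.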
model = load_model('ISCO-Coder-BERT.h5', custom_objects={'KerasLayer': hub.KerasLayer})
bert_layer = hub.KerasLayer("https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/1", trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)
# Parameters
max_seq_length = 128
dummy_label = 100
label_list = list(pd.read_excel('label_list.xlsx')['label_list'])
map_data = pd.read_excel("ISCO-08 EN Structure and definitions.xlsx")
label_encoder = joblib.load('label_encoder.joblib')
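# label_list preserves the class order used during training, map_data links each ISCO-08
# code to its English title, and label_encoder maps predicted class labels back to ISCO codes.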
# Define a function to preprocess the new data
def get_feature_new(text, max_seq_length, tokenizer, dummy_label):
    example = classifier_data_lib.InputExample(guid=None,
                                               text_a=text.numpy().decode('utf-8'),
                                               text_b=None,
                                               label=dummy_label)  # Use a valid dummy label
    feature = classifier_data_lib.convert_single_example(0, example, label_list, max_seq_length, tokenizer)
    return feature.input_ids, feature.input_mask, feature.segment_ids
def get_feature_map_new(text):
    input_ids, input_mask, segment_ids = tf.py_function(
        lambda text: get_feature_new(text, max_seq_length, tokenizer, dummy_label),
        inp=[text],
        Tout=[tf.int32, tf.int32, tf.int32]
    )
    input_ids.set_shape([max_seq_length])
    input_mask.set_shape([max_seq_length])
    segment_ids.set_shape([max_seq_length])
    x = {'input_word_ids': input_ids,
         'input_mask': input_mask,
         'input_type_ids': segment_ids}
    return x
def preprocess_new_data(texts):
    dataset = tf.data.Dataset.from_tensor_slices((texts,))
    dataset = dataset.map(get_feature_map_new,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(32, drop_remainder=False)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
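# Translate the input to English, run it through the preprocessing pipeline, predict with
# the fine-tuned model, and format the result for display in the Gradio UI.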
def launch(text_input):
    # The label encoder is loaded globally above; an alternative encoder could be loaded here:
    # label_encoder = joblib.load('label_encoderV2.joblib')

    # Translate the input to English; keep the original text if translation fails
    try:
        text_input = GoogleTranslator(source='auto', target='en').translate(text_input)
    except Exception:
        pass

    # Preprocess the new data
    sample_example = [text_input]
    new_data_dataset = preprocess_new_data(sample_example)

    # Make predictions on the new data (the model is already loaded at module level)
    predictions = model.predict(new_data_dataset)

    # Decode the predictions
    predicted_classes = [label_list[np.argmax(pred)] for pred in predictions]

    # Calculate the highest probabilities
    highest_probabilities = [max(instance) for instance in predictions]

    # Decode labels using the label encoder
    decoded_labels = label_encoder.inverse_transform(predicted_classes)

    # Retrieve the ISCO description based on the decoded label
    isco_description = map_data[map_data['ISCO 08 Code'] == decoded_labels[0]]['Title EN'].values

    # Print for debugging (optional)
    print(f"Most likely ISCO code is {decoded_labels[0]} and probability is {highest_probabilities[0]}")
    print(text_input)

    # Create descriptive text for the output
    result_text = (
        f"Predicted ISCO Code: {decoded_labels[0]}\n"
        f"Probability: {highest_probabilities[0]:.2f}\n"
        f"ISCO Description: {isco_description[0] if len(isco_description) > 0 else 'Description not found'}"
    )
    return result_text
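# Optional sanity check without the UI (hypothetical example input; uncomment to try):
# print(launch("software developer who builds and maintains web applications"))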
# Define the Gradio interface
iface = gr.Interface(
    fn=launch,
    inputs=gr.Textbox(
        lines=2,
        placeholder="Enter job title in any language (e.g., Software Engineer) AND/OR description here (e.g., Develops and maintains software applications)..."
    ),
    outputs=gr.Textbox(
        lines=4,
        placeholder="Predicted ISCO Code: <result>\nProbability: <result>\nISCO Description: <result>"
    )
)
iface.launch()