File size: 4,427 Bytes
32e8749
 
 
 
 
 
a7f33dd
 
32e8749
49e21b1
cebee93
 
fba8174
32e8749
7e3c140
32e8749
a0332ee
 
 
 
32e8749
 
 
 
 
7e3c140
32e8749
0231c56
32e8749
7e3c140
c1c1765
32e8749
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fba8174
49e21b1
fba8174
c1c1765
fba8174
 
49e21b1
 
 
 
 
 
fba8174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0231c56
7ec7391
 
 
 
 
db9f5fe
7ec7391
cebee93
7ec7391
 
 
 
 
 
 
fba8174
7ec7391
4397a91
 
7ec7391
 
3e9afcd
7ec7391
 
 
 
 
4397a91
 
6ca040b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tf_keras as keras
import pandas as pd
from tensorflow.keras.models import load_model
import classifier_data_lib
import tokenization
import joblib
from deep_translator import GoogleTranslator
import sys
import os
import gradio as gr

# Load the trained classifier from the local HDF5 file. hub.KerasLayer is
# registered as a custom object so Keras can resolve the 'KerasLayer' class
# name while deserializing (presumably the saved model embeds a hub layer).
model = load_model('ISCO-Coder-BERT.h5', custom_objects={'KerasLayer': hub.KerasLayer})

# BERT layer fetched from the hub URL below. In the code visible here it is
# only used to obtain the vocabulary file and lower-casing flag for the
# tokenizer; predictions are made with `model` above.
bert_layer = hub.KerasLayer("https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/1",trainable=True)

# Build the tokenizer from the assets shipped with the BERT layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file,do_lower_case)

# Parameters
# Length of each feature vector; also used to fix the tensor shapes in
# get_feature_map_new.
max_seq_length = 128
# Placeholder label attached to every inference example (no true label exists
# at prediction time). NOTE(review): presumably has to be a value present in
# label_list - confirm against classifier_data_lib.convert_single_example.
dummy_label = 100
# Class labels read from the 'label_list' column. launch() indexes this list
# with the argmax of the model output, so its order is assumed to match the
# model's output units - TODO confirm.
label_list = list(pd.read_excel('label_list.xlsx')['label_list'])

# ISCO-08 reference table; launch() looks up 'Title EN' by 'ISCO 08 Code'.
map_data = pd.read_excel("ISCO-08 EN Structure and definitions.xlsx")

# Encoder used in launch() to turn predicted labels back into ISCO codes.
# joblib.load unpickles the file, so only load a trusted artifact here.
label_encoder = joblib.load('label_encoder.joblib')

# Define a function to preprocess the new data
def get_feature_new(text, max_seq_length, tokenizer, dummy_label):
    """Convert a single text tensor into BERT input features.

    Args:
        text: Tensor-like object exposing ``.numpy()`` that yields UTF-8
            encoded bytes (this is what ``tf.py_function`` hands over).
        max_seq_length: Sequence length forwarded to
            ``classifier_data_lib.convert_single_example``.
        tokenizer: Tokenizer forwarded to ``convert_single_example``.
        dummy_label: Placeholder label stored on the example, since no true
            label is available for new data.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids)`` read from the
        converted feature.
    """
    decoded_text = text.numpy().decode('utf-8')

    # Single-sentence example: no guid and no second text segment.
    single_example = classifier_data_lib.InputExample(
        guid=None,
        text_a=decoded_text,
        text_b=None,
        label=dummy_label,
    )

    # The label vocabulary comes from the module-level label_list.
    converted = classifier_data_lib.convert_single_example(
        0, single_example, label_list, max_seq_length, tokenizer
    )
    return converted.input_ids, converted.input_mask, converted.segment_ids

def get_feature_map_new(text):
    """Map one text tensor to the dictionary of model inputs.

    The Python-side feature extraction is wrapped in ``tf.py_function`` so it
    can be used from ``tf.data.Dataset.map``.

    Args:
        text: Scalar string tensor holding the text to encode.

    Returns:
        Dict with keys ``'input_word_ids'``, ``'input_mask'`` and
        ``'input_type_ids'``, each an int32 tensor whose shape is set to
        ``[max_seq_length]``.
    """
    def _encode(raw_text):
        return get_feature_new(raw_text, max_seq_length, tokenizer, dummy_label)

    word_ids, mask, type_ids = tf.py_function(
        _encode,
        inp=[text],
        Tout=[tf.int32, tf.int32, tf.int32],
    )

    # py_function outputs carry no static shape; declare it explicitly.
    for tensor in (word_ids, mask, type_ids):
        tensor.set_shape([max_seq_length])

    return {
        'input_word_ids': word_ids,
        'input_mask': mask,
        'input_type_ids': type_ids,
    }

def preprocess_new_data(texts):
    """Build the batched ``tf.data`` input pipeline for new texts.

    Args:
        texts: Sequence of strings to classify.

    Returns:
        A ``tf.data.Dataset`` of feature dictionaries produced by
        ``get_feature_map_new``, in batches of up to 32 (the last, smaller
        batch is kept) with prefetching enabled.
    """
    autotune = tf.data.experimental.AUTOTUNE
    return (
        tf.data.Dataset.from_tensor_slices((texts,))
        .map(get_feature_map_new, num_parallel_calls=autotune)
        .batch(32, drop_remainder=False)
        .prefetch(autotune)
    )
def launch(text_input):
    """Predict the ISCO-08 code for a job title and/or description.

    The text is translated to English on a best-effort basis, converted to
    BERT features, scored with the module-level ``model``, and the
    top-scoring class is mapped back to an ISCO code and its English title.

    Args:
        text_input: Free text entered by the user, in any language.

    Returns:
        A three-line string with the predicted ISCO code, the model's top
        score (labelled "Probability", two decimals) and the matching title
        from ``map_data`` (or ``'Description not found'``). When the input is
        ``None`` or blank, a short prompt asking for text is returned instead.
    """
    # Nothing to classify: skip translation and the model for blank input.
    if text_input is None or not str(text_input).strip():
        return "Please enter a job title and/or description."

    # Translation is best effort. Any failure (network, unsupported text,
    # length limits, ...) falls back to the original text, but is reported
    # instead of being swallowed silently. `Exception` rather than a bare
    # `except` so KeyboardInterrupt/SystemExit still propagate.
    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text_input)
    except Exception as exc:
        print(f"Translation failed ({exc!r}); using the original text")
        translated = None

    # Defensive: keep the user's text if the translator returned nothing.
    if translated:
        text_input = translated

    new_data_dataset = preprocess_new_data([text_input])

    # Make predictions on the new data with the module-level model.
    predictions = model.predict(new_data_dataset)

    # Exactly one text was submitted, so only the first row matters.
    scores = predictions[0]
    best_index = int(np.argmax(scores))
    highest_probability = scores[best_index]

    # label_list holds encoded labels; the encoder maps them back to ISCO codes.
    isco_code = label_encoder.inverse_transform([label_list[best_index]])[0]

    # Retrieve the ISCO title for the decoded code (may be absent in the table).
    matching_titles = map_data[map_data['ISCO 08 Code'] == isco_code]['Title EN'].values
    if len(matching_titles) > 0:
        isco_description = matching_titles[0]
    else:
        isco_description = 'Description not found'

    # Print for debugging (optional)
    print(f"Most likely ISCO code is {isco_code} and probability is {highest_probability}")
    print(text_input)

    # Create descriptive text for the output
    return (
        f"Predicted ISCO Code: {isco_code}\n"
        f"Probability: {highest_probability:.2f}\n"
        f"ISCO Description: {isco_description}"
    )

# Gradio UI: one free-text input box wired to launch(), one text box for the
# formatted prediction.
_job_text_box = gr.Textbox(
    lines=2,
    placeholder="Enter job title in any language (e.g., Software Engineer) AND/OR description here (e.g., Develops and maintains software applications)...",
)
_result_box = gr.Textbox(
    lines=4,
    placeholder="Predicted ISCO Code: <result>\nProbability: <result>\nISCO Description: <result>",
)
iface = gr.Interface(fn=launch, inputs=_job_text_box, outputs=_result_box)

# Start serving the interface as soon as the module is executed.
iface.launch()