import streamlit as st

# Page setup: wide layout, sidebar initial state left on "auto".
page_options = {
    "layout": "wide",
    "initial_sidebar_state": "auto",
}
st.set_page_config(**page_options)

# Page-wide stylesheet. It defines the classes referenced by the HTML
# fragments rendered on this page: main-title, sub-title, section, link
# and benchmark-table.
page_css = """
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .sub-title {
            font-size: 24px;
            color: #4A90E2;
            margin-top: 20px;
        }
        .section {
            background-color: #f9f9f9;
            padding: 15px;
            border-radius: 10px;
            margin-top: 20px;
        }
        .section h2 {
            font-size: 22px;
            color: #4A90E2;
        }
        .section p, .section ul {
            color: #666666;
        }
        .link {
            color: #4A90E2;
            text-decoration: none;
        }
        .benchmark-table {
            width: 100%;
            border-collapse: collapse;
            margin-top: 20px;
        }
        .benchmark-table th, .benchmark-table td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        .benchmark-table th {
            background-color: #4A90E2;
            color: white;
        }
        .benchmark-table td {
            background-color: #f2f2f2;
        }
    </style>
"""
# The stylesheet is raw HTML, so it is rendered with unsafe_allow_html=True.
st.markdown(page_css, unsafe_allow_html=True)

# Main page heading, styled through the "main-title" CSS class.
title_html = '<div class="main-title">Introduction to CamemBERT Annotators in Spark NLP</div>'
st.markdown(title_html, unsafe_allow_html=True)

# Introductory paragraph shown directly under the page title.
intro_html = """
<div class="section">
    <p>Spark NLP offers a variety of CamemBERT-based annotators tailored for multiple natural language processing tasks. CamemBERT is a robust and versatile model designed specifically for the French language, offering state-of-the-art performance in a range of NLP applications. Below, we provide an overview of the four key CamemBERT annotators:</p>
</div>
"""
st.markdown(intro_html, unsafe_allow_html=True)

# Overview of the CamemBertForTokenClassification annotator: a description,
# a list of use cases and an illustrative entity/label table.
token_classification_html = """
<div class="section">
    <h2>CamemBERT for Token Classification</h2>
    <p>The <strong>CamemBertForTokenClassification</strong> annotator is designed for Named Entity Recognition (NER) tasks using CamemBERT, a French language model derived from RoBERTa. This model efficiently handles token classification, which involves labeling tokens in a text with tags that correspond to specific entities. CamemBERT offers robust performance in French NLP tasks, making it a valuable tool for real-time applications in this language.</p>
    <p>Token classification with CamemBERT enables:</p>
    <ul>
        <li><strong>Named Entity Recognition (NER):</strong> Identifying and classifying entities such as names, organizations, locations, and other predefined categories.</li>
        <li><strong>Information Extraction:</strong> Extracting key information from unstructured text for further analysis.</li>
        <li><strong>Text Categorization:</strong> Enhancing document retrieval and categorization based on entity recognition.</li>
    </ul>
    <p>Here is an example of how CamemBERT token classification works:</p>
    <table class="benchmark-table">
        <tr>
            <th>Entity</th>
            <th>Label</th>
        </tr>
        <tr>
            <td>Paris</td>
            <td>LOC</td>
        </tr>
        <tr>
            <td>Emmanuel Macron</td>
            <td>PER</td>
        </tr>
        <tr>
            <td>Élysée Palace</td>
            <td>ORG</td>
        </tr>
    </table>
</div>
"""
st.markdown(token_classification_html, unsafe_allow_html=True)

# French WikiNER model: sub-title followed by a short description card.
wikiner_heading_html = '<div class="sub-title">CamemBERT Token Classification - French WikiNER</div>'
wikiner_body_html = """
<div class="section">
    <p>The <strong>camembert_base_token_classifier_wikiner</strong> is a fine-tuned CamemBERT model for token classification tasks, specifically adapted for Named Entity Recognition (NER) on the French WikiNER dataset. It is designed to recognize five types of entities: O, LOC, PER, MISC, and ORG.</p>
</div>
"""
# Render the heading first, then the body, each as raw HTML.
for wikiner_fragment in (wikiner_heading_html, wikiner_body_html):
    st.markdown(wikiner_fragment, unsafe_allow_html=True)

# "How to Use the Model" section: a sub-title plus a Spark NLP pipeline sample
# that is displayed as Python source (it is shown, not executed, by this page).
# NOTE(review): the sample uses a `spark` session that it never creates, and it
# loads the model with language code 'en' although the page describes a French
# model -- confirm both against the Spark NLP model page.
usage_heading_html = '<div class="sub-title">How to Use the Model</div>'
# Kept as a ''' literal because the sample itself contains a """ string;
# the doubled backslashes render as single line-continuation backslashes.
usage_snippet = '''
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, expr

document_assembler = DocumentAssembler() \\
    .setInputCol('text') \\
    .setOutputCol('document')

tokenizer = Tokenizer() \\
    .setInputCols(['document']) \\
    .setOutputCol('token')

tokenClassifier = CamemBertForTokenClassification \\
    .pretrained('camembert_base_token_classifier_wikiner', 'en') \\
    .setInputCols(['document', 'token']) \\
    .setOutputCol('ner') \\
    .setCaseSensitive(True) \\
    .setMaxSentenceLength(512)

# Convert NER labels to entities
ner_converter = NerConverter() \\
    .setInputCols(['document', 'token', 'ner']) \\
    .setOutputCol('entities')

pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    tokenClassifier,
    ner_converter
])

data = spark.createDataFrame([["""Paris est la capitale de la France et abrite le Président Emmanuel Macron, qui réside au palais de l'Élysée. Apple Inc. a une présence significative dans la ville."""]]).toDF("text")
result = pipeline.fit(data).transform(data)

result.select(
    expr("explode(entities) as ner_chunk")
).select(
    col("ner_chunk.result").alias("chunk"),
    col("ner_chunk.metadata.entity").alias("ner_label")
).show(truncate=False)
'''
st.markdown(usage_heading_html, unsafe_allow_html=True)
st.code(usage_snippet, language='python')

# Sample output table for the pipeline shown in the usage snippet, rendered
# as fixed-width text so the column borders stay aligned.
# NOTE(review): the row "Élysée Palace" does not appear verbatim in the French
# input sentence of the snippet ("palais de l'Élysée") -- confirm this table
# against a real run of the pipeline.
sample_results_table = """
+------------------+---------+
|chunk             |ner_label|
+------------------+---------+
|Paris             |LOC      |
|France            |LOC      |
|Emmanuel Macron   |PER      |
|Élysée Palace     |ORG      |
|Apple Inc.        |ORG      |
+------------------+---------+
"""
st.text(sample_results_table)

# "Performance Metrics" section: sub-title plus a per-label table of
# precision, recall and F1-score, with an "Overall" row at the end.
metrics_heading_html = '<div class="sub-title">Performance Metrics</div>'
metrics_table_html = """
<div class="section">
    <p>Here are the detailed performance metrics for the CamemBERT token classification model:</p>
    <table class="benchmark-table">
        <tr>
            <th>Entity</th>
            <th>Precision</th>
            <th>Recall</th>
            <th>F1-Score</th>
        </tr>
        <tr>
            <td>LOC</td>
            <td>0.93</td>
            <td>0.94</td>
            <td>0.94</td>
        </tr>
        <tr>
            <td>PER</td>
            <td>0.95</td>
            <td>0.95</td>
            <td>0.95</td>
        </tr>
        <tr>
            <td>ORG</td>
            <td>0.92</td>
            <td>0.91</td>
            <td>0.91</td>
        </tr>
        <tr>
            <td>MISC</td>
            <td>0.86</td>
            <td>0.85</td>
            <td>0.85</td>
        </tr>
        <tr>
            <td>O</td>
            <td>0.99</td>
            <td>0.99</td>
            <td>0.99</td>
        </tr>
        <tr>
            <td>Overall</td>
            <td>0.97</td>
            <td>0.98</td>
            <td>0.98</td>
        </tr>
    </table>
</div>
"""
# Render the heading first, then the table, each as raw HTML.
for metrics_fragment in (metrics_heading_html, metrics_table_html):
    st.markdown(metrics_fragment, unsafe_allow_html=True)

# "Model Information" section: sub-title plus a bullet list of model facts
# (name, compatibility, license, labels, language, size, settings).
model_info_heading_html = '<div class="sub-title">Model Information</div>'
model_info_list_html = """
<div class="section">
    <ul>
        <li><strong>Model Name:</strong> camembert_base_token_classifier_wikiner</li>
        <li><strong>Compatibility:</strong> Spark NLP 4.2.0+</li>
        <li><strong>License:</strong> Open Source</li>
        <li><strong>Edition:</strong> Official</li>
        <li><strong>Input Labels:</strong> [token, document]</li>
        <li><strong>Output Labels:</strong> [ner]</li>
        <li><strong>Language:</strong> French</li>
        <li><strong>Size:</strong> 412.2 MB</li>
        <li><strong>Case Sensitive:</strong> Yes</li>
        <li><strong>Max Sentence Length:</strong> 512</li>
    </ul>
</div>
"""
# Render the heading first, then the list, each as raw HTML.
for model_info_fragment in (model_info_heading_html, model_info_list_html):
    st.markdown(model_info_fragment, unsafe_allow_html=True)

# "References" section: sub-title plus external links to the dataset and the
# model page; each link opens in a new tab with rel="noopener".
references_heading_html = '<div class="sub-title">References</div>'
references_list_html = """
<div class="section">
    <ul>
        <li><a class="link" href="https://huggingface.co/datasets/Jean-Baptiste/wikiner_fr" target="_blank" rel="noopener">CamemBERT WikiNER Dataset</a></li>
        <li><a class="link" href="https://sparknlp.org/2022/09/23/camembert_base_token_classifier_wikiner_en.html" target="_blank" rel="noopener">CamemBERT Token Classification on Spark NLP Hub</a></li>
    </ul>
</div>
"""
# Render the heading first, then the link list, each as raw HTML.
for references_fragment in (references_heading_html, references_list_html):
    st.markdown(references_fragment, unsafe_allow_html=True)