martinakaduc committed on
Commit
5e8ceb0
·
verified ·
1 Parent(s): 506102d

Upload 6 files

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +5 -6
  3. app.R +132 -49
  4. bertembedtoy.py +70 -0
  5. model.rds +3 -0
  6. requirements.txt +3 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ model.rds filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -2,13 +2,12 @@ FROM rocker/r-base:latest
2
 
3
  WORKDIR /code
4
 
 
 
5
  RUN install2.r --error \
6
  shiny \
7
- dplyr \
8
- ggplot2 \
9
- readr \
10
- ggExtra
11
-
12
- COPY . .
13
 
14
  CMD ["R", "--quiet", "-e", "shiny::runApp(host='0.0.0.0', port=7860)"]
 
FROM rocker/r-base:latest

WORKDIR /code

COPY . .

# R packages used by app.R (shiny for the app, jsonlite for the JSON
# handed to the Python helper).
RUN install2.r --error \
    shiny \
    jsonlite

# rocker/r-base ships no Python interpreter or pip, so `pip install`
# would fail. Install python3 + pip via apt first, then the Python
# dependencies used by bertembedtoy.py. --break-system-packages is
# required on PEP 668 (newer Debian) images; harmless otherwise is not
# guaranteed — confirm against the base image's Debian release.
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-pip && \
    rm -rf /var/lib/apt/lists/* && \
    pip3 install --break-system-packages -r requirements.txt

CMD ["R", "--quiet", "-e", "shiny::runApp(host='0.0.0.0', port=7860)"]
app.R CHANGED
@@ -1,58 +1,141 @@
 
1
  library(shiny)
2
- library(bslib)
3
- library(dplyr)
4
- library(ggplot2)
5
-
6
- df <- readr::read_csv("penguins.csv")
7
- # Find subset of columns that are suitable for scatter plot
8
- df_num <- df |> select(where(is.numeric), -Year)
9
-
10
- ui <- page_sidebar(
11
- theme = bs_theme(bootswatch = "minty"),
12
- title = "Penguins explorer",
13
- sidebar = sidebar(
14
- varSelectInput("xvar", "X variable", df_num, selected = "Bill Length (mm)"),
15
- varSelectInput("yvar", "Y variable", df_num, selected = "Bill Depth (mm)"),
16
- checkboxGroupInput("species", "Filter by species",
17
- choices = unique(df$Species), selected = unique(df$Species)
18
- ),
19
- hr(), # Add a horizontal rule
20
- checkboxInput("by_species", "Show species", TRUE),
21
- checkboxInput("show_margins", "Show marginal plots", TRUE),
22
- checkboxInput("smooth", "Add smoother"),
 
 
 
 
 
 
 
 
 
 
23
  ),
24
- plotOutput("scatter")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  server <- function(input, output, session) {
28
- subsetted <- reactive({
29
- req(input$species)
30
- df |> filter(Species %in% input$species)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  })
32
 
33
- output$scatter <- renderPlot(
34
- {
35
- p <- ggplot(subsetted(), aes(!!input$xvar, !!input$yvar)) +
36
- theme_light() +
37
- list(
38
- theme(legend.position = "bottom"),
39
- if (input$by_species) aes(color = Species),
40
- geom_point(),
41
- if (input$smooth) geom_smooth()
42
- )
43
-
44
- if (input$show_margins) {
45
- margin_type <- if (input$by_species) "density" else "histogram"
46
- p <- p |> ggExtra::ggMarginal(
47
- type = margin_type, margins = "both",
48
- size = 8, groupColour = input$by_species, groupFill = input$by_species
49
- )
50
- }
51
-
52
- p
53
- },
54
- res = 100
55
- )
 
 
 
 
 
56
  }
57
 
58
- shinyApp(ui, server)
 
 
1
# Load required libraries
library(shiny)
library(jsonlite) # For JSON conversion

# Python executable used to run bertembedtoy.py.
# NOTE(review): assumes `python` is on PATH in the deployment image;
# adjust to "python3" if only that binary exists — confirm against the
# Dockerfile.
python_path <- "python"

# Pre-trained model mapping 768 BERT embedding features
# (embed.bert1..embed.bert768) to a difficulty score; loaded once at
# app start-up.
model <- readRDS("model.rds")

# Reference table shown in the UI: mean difficulty by grade level.
# check.names = FALSE keeps the human-readable header "Mean Grade-Level
# Difficulty" intact — the data.frame() default would silently mangle
# it to "Mean.Grade.Level.Difficulty" in the rendered table.
difficulty_table <- data.frame(
  Grade = c("Grade 3", "Grade 4", "Grade 5", "Grade 6", "Grade 7", "Grade 8"),
  `Mean Grade-Level Difficulty` = c(0.3, 0.431, 0.533, 0.611, 0.656, 0.7),
  check.names = FALSE
)
14
+
15
# Define UI for the Shiny application
ui <- fluidPage(
  titlePanel("Reading comprehension difficulty prediction using BERT embeddings"),
  fluidRow(
    column(
      width = 12,
      h4("How does this work?"),
      p("This app predicts average difficulty for an item. Difficulty can be interpreted using the table below. As the table shows, difficulty increases with grade level.
      For example, an item of difficulty 0.3 is of average difficulty for Grade 3, an item of difficulty 0.4 is of average difficulty for Grade 4, and so on. Note that as difficulty increases, probability of correct answer reduces."),

      # Display the grade-level difficulty reference table
      h5(""),
      tableOutput("difficultyTable"),

      # External link for the NWEA norms the difficulty scale is based on
      p("Difficulty outputs are on a linear scale. The scale is defined to have mean difficulty of 0.3 at Grade 3 and mean difficulty 0.7 at Grade 8. This scale is based on grade level growth norms reported by ",
        a("NWEA MAP Spring 2020 Reading Student Achievement Norms",
          href = "https://www.nwea.org/uploads/MAP-Growth-Normative-Data-Overview.pdf", target = "_blank"))
    )
  ),

  # Item entry form
  sidebarLayout(
    sidebarPanel(
      width = 12,
      # textAreaInput (rather than single-line textInput) so multi-sentence
      # passages and questions can be entered and reviewed; the input IDs
      # ("passage", "question") are unchanged, so server logic is unaffected.
      textAreaInput("passage", "Passage", placeholder = "Enter passage text here", rows = 6),
      textAreaInput("question", "Question Text", placeholder = "Enter question text here", rows = 2),
      textInput("correctAnswer", "Correct Answer", placeholder = "Enter the correct answer here"),

      # Numeric input to ask how many incorrect options (distractors)
      numericInput("numDistractors", "Number of Incorrect Options:", value = 1, min = 1, max = 10),

      # One text input per distractor, generated server-side
      uiOutput("distractorsInputs"),

      actionButton("printBtn", "Estimate difficulty")
    ),

    mainPanel(
      h3("Estimated difficulty"),
      verbatimTextOutput("inputsOutput")
    )
  )
)
59
 
60
# Serialize the item fields to JSON and pipe them to bertembedtoy.py,
# which prints a comma-separated 768-dimensional BERT embedding on stdout.
#
# Args:
#   passage, question: character scalars entered by the user.
#   distractors: a single string combining the correct answer and all
#     wrong answers (built by the caller).
# Returns:
#   Character vector of the script's stdout lines (normally one line of
#   comma-separated numbers). On failure a warning is raised and whatever
#   partial output exists is returned, so the caller can validate it.
call_python_script <- function(passage, question, distractors) {
  # Prepare input data as JSON (auto_unbox keeps scalars as JSON strings,
  # not length-1 arrays)
  input_data <- toJSON(list(
    Passage = passage,
    QuestionText = question,
    Distractors = distractors
  ), auto_unbox = TRUE)

  # Call the Python script, passing the JSON on stdin and capturing stdout
  result <- system2(
    command = python_path,
    args = c("bertembedtoy.py"),
    input = input_data,
    stdout = TRUE
  )

  # With stdout = TRUE, system2() signals a non-zero exit code via a
  # "status" attribute rather than an error — surface it instead of
  # silently returning empty/partial output.
  status <- attr(result, "status")
  if (!is.null(status) && status != 0) {
    warning("bertembedtoy.py exited with status ", status, call. = FALSE)
  }

  result
}
81
+
82
+
83
# Define server logic
server <- function(input, output, session) {
  # Latest model prediction (NULL until the button is first clicked);
  # may also hold an error message string when the embedding step fails.
  predictions <- reactiveVal(NULL)

  # Render the grade-level difficulty reference table (no rownames)
  output$difficultyTable <- renderTable({
    difficulty_table
  }, rownames = FALSE)

  # Dynamically generate one text input per requested distractor
  output$distractorsInputs <- renderUI({
    n <- input$numDistractors
    if (is.null(n) || is.na(n) || n <= 0) return(NULL) # invalid count -> no inputs

    lapply(seq_len(n), function(i) {
      textInput(inputId = paste0("distractor", i),
                label = paste("Wrong answer", i),
                placeholder = paste("Enter wrong answer", i))
    })
  })

  # Estimate difficulty when the button is clicked
  observeEvent(input$printBtn, {
    req(input$numDistractors)

    # Label each distractor, then combine with the correct answer into
    # the single newline-separated string the Python script expects.
    distractors <- vapply(seq_len(input$numDistractors), function(i) {
      paste0("wrong answer ", i, ":", input[[paste0("distractor", i)]])
    }, character(1))
    answer_block <- paste(
      c(paste0("Correct answer:", input$correctAnswer), distractors),
      collapse = " \n"
    )

    # Get the 768-dim BERT embedding from the Python helper
    python_output <- call_python_script(input$passage,
                                        input$question,
                                        answer_block)
    result_vector <- as.numeric(unlist(strsplit(python_output, ",")))

    # Guard: if the script failed we will not have 768 finite numbers;
    # previously this crashed the observer when names() was assigned.
    if (length(result_vector) != 768 || anyNA(result_vector)) {
      predictions("Embedding failed: check the Python environment and inputs.")
      return()
    }

    # One-row data frame with the feature names the model was trained on
    features <- as.data.frame(t(result_vector))
    names(features) <- paste0("embed.bert", 1:768)
    predictions(predict(model, newdata = features))
  })

  # Output the latest prediction (or error message) to the UI
  output$inputsOutput <- renderPrint({
    predictions()
  })
}
139
 
140
# Launch the Shiny application
shinyApp(ui, server)
bertembedtoy.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import json
3
+ import transformers
4
+ import torch
5
+
6
+
7
def vectorize_with_pretrained_embeddings(sentences):
    """Embed each sentence with pre-trained BERT (bert-base-cased).

    Args:
        sentences: list of n sentence strings.

    Returns:
        numpy array of shape (n, 768): the mean over the sequence
        dimension of BERT's last hidden state, one row per sentence.
    """
    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-cased")
    bert = transformers.BertModel.from_pretrained(
        "bert-base-cased", output_hidden_states=False)
    bert.eval()

    pooled = []
    for text in sentences:
        # Wrap with BERT's special tokens, then cap at the 512-token limit.
        # NOTE(review): truncating after tagging can drop the trailing
        # [SEP] for long inputs, and segment ids are all ones rather than
        # the conventional zeros for a single segment. Both are kept
        # as-is so embeddings stay identical to the pipeline that
        # produced the training data for model.rds — confirm before
        # changing.
        tokens = tokenizer.tokenize("[CLS] " + text + " [SEP]")[:512]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        token_tensor = torch.tensor([ids])
        segment_tensor = torch.ones_like(token_tensor)
        with torch.no_grad():
            # [0] selects the last hidden state:
            # shape 1 x sequence_length x 768
            hidden = bert(token_tensor, segment_tensor)[0]
        # Average across the sequence dimension to get a constant-size
        # (1, 768) vector regardless of sentence length.
        pooled.append(hidden.mean(dim=1))

    return torch.cat(pooled, dim=0).cpu().detach().numpy()
48
+
49
+
50
def main():
    """Read item JSON from stdin; print a comma-separated BERT embedding.

    Expected input keys: "Passage", "QuestionText", "Distractors"
    (missing keys default to empty strings). The 768 floats are written
    as a single comma-separated line on stdout so the calling R process
    can capture them with system2(stdout = TRUE).
    """
    # Step 1: Read and parse JSON input from stdin. Fail loudly on
    # stderr with a non-zero exit so the caller can detect the failure
    # instead of parsing an empty stdout into NAs.
    try:
        inputs = json.loads(sys.stdin.read())
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON input: {exc}", file=sys.stderr)
        sys.exit(1)

    # Step 2: Extract inputs (all optional)
    passage = inputs.get("Passage", "")
    question = inputs.get("QuestionText", "")
    distractors = inputs.get("Distractors", "")

    # One combined "sentence" per item: question, answers, then passage —
    # this ordering must match the pipeline used to train model.rds.
    combined_input = [f"{question}\n{distractors}\n{passage}"]
    embedding = vectorize_with_pretrained_embeddings(combined_input)

    # Flatten to 1-D and emit as a single comma-separated line
    print(",".join(map(str, embedding.flatten())))


if __name__ == "__main__":
    main()
model.rds ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:513cc1c15ee297b97cf8f29caf681a3a3c601f3cfa7df8cb21d68ade3ca58b3c
3
+ size 10864973
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers==4.47.0
2
+ torch==2.2.2
3
+ numpy==1.26.4