martinakaduc committed on
Commit
5e8ceb0
·
verified ·
1 Parent(s): 506102d

Upload 6 files

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +5 -6
  3. app.R +132 -49
  4. bertembedtoy.py +70 -0
  5. model.rds +3 -0
  6. requirements.txt +3 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ model.rds filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -2,13 +2,12 @@ FROM rocker/r-base:latest
2
 
3
  WORKDIR /code
4
 
 
 
5
  RUN install2.r --error \
6
  shiny \
7
- dplyr \
8
- ggplot2 \
9
- readr \
10
- ggExtra
11
-
12
- COPY . .
13
 
14
  CMD ["R", "--quiet", "-e", "shiny::runApp(host='0.0.0.0', port=7860)"]
 
FROM rocker/r-base:latest

WORKDIR /code

COPY . .

# R packages used by app.R (shiny for the app, jsonlite for the JSON
# handed to the Python helper).
RUN install2.r --error \
    shiny \
    jsonlite

# rocker/r-base ships no Python interpreter or pip, so `pip install`
# would fail. Install python3 + pip via apt first, then the Python
# dependencies used by bertembedtoy.py. --break-system-packages is
# required on PEP 668 (newer Debian) images; harmless otherwise is not
# guaranteed — confirm against the base image's Debian release.
RUN apt-get update && \
    apt-get install -y --no-install-recommends python3 python3-pip && \
    rm -rf /var/lib/apt/lists/* && \
    pip3 install --break-system-packages -r requirements.txt

CMD ["R", "--quiet", "-e", "shiny::runApp(host='0.0.0.0', port=7860)"]
app.R CHANGED
@@ -1,58 +1,141 @@
 
1
  library(shiny)
2
- library(bslib)
3
- library(dplyr)
4
- library(ggplot2)
5
-
6
- df <- readr::read_csv("penguins.csv")
7
- # Find subset of columns that are suitable for scatter plot
8
- df_num <- df |> select(where(is.numeric), -Year)
9
-
10
- ui <- page_sidebar(
11
- theme = bs_theme(bootswatch = "minty"),
12
- title = "Penguins explorer",
13
- sidebar = sidebar(
14
- varSelectInput("xvar", "X variable", df_num, selected = "Bill Length (mm)"),
15
- varSelectInput("yvar", "Y variable", df_num, selected = "Bill Depth (mm)"),
16
- checkboxGroupInput("species", "Filter by species",
17
- choices = unique(df$Species), selected = unique(df$Species)
18
- ),
19
- hr(), # Add a horizontal rule
20
- checkboxInput("by_species", "Show species", TRUE),
21
- checkboxInput("show_margins", "Show marginal plots", TRUE),
22
- checkboxInput("smooth", "Add smoother"),
 
 
 
 
 
 
 
 
 
 
23
  ),
24
- plotOutput("scatter")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  server <- function(input, output, session) {
28
- subsetted <- reactive({
29
- req(input$species)
30
- df |> filter(Species %in% input$species)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  })
32
 
33
- output$scatter <- renderPlot(
34
- {
35
- p <- ggplot(subsetted(), aes(!!input$xvar, !!input$yvar)) +
36
- theme_light() +
37
- list(
38
- theme(legend.position = "bottom"),
39
- if (input$by_species) aes(color = Species),
40
- geom_point(),
41
- if (input$smooth) geom_smooth()
42
- )
43
-
44
- if (input$show_margins) {
45
- margin_type <- if (input$by_species) "density" else "histogram"
46
- p <- p |> ggExtra::ggMarginal(
47
- type = margin_type, margins = "both",
48
- size = 8, groupColour = input$by_species, groupFill = input$by_species
49
- )
50
- }
51
-
52
- p
53
- },
54
- res = 100
55
- )
 
 
 
 
 
56
  }
57
 
58
- shinyApp(ui, server)
 
 
1
# Load required libraries
library(shiny)
library(jsonlite) # For JSON conversion

# Python executable used to run bertembedtoy.py.
# NOTE(review): assumes `python` is on PATH in the deployment image;
# adjust to "python3" if only that binary exists — confirm against the
# Dockerfile.
python_path <- "python"

# Pre-trained model mapping 768 BERT embedding features
# (embed.bert1..embed.bert768) to a difficulty score; loaded once at
# app start-up.
model <- readRDS("model.rds")

# Reference table shown in the UI: mean difficulty by grade level.
# check.names = FALSE keeps the human-readable header "Mean Grade-Level
# Difficulty" intact — the data.frame() default would silently mangle
# it to "Mean.Grade.Level.Difficulty" in the rendered table.
difficulty_table <- data.frame(
  Grade = c("Grade 3", "Grade 4", "Grade 5", "Grade 6", "Grade 7", "Grade 8"),
  `Mean Grade-Level Difficulty` = c(0.3, 0.431, 0.533, 0.611, 0.656, 0.7),
  check.names = FALSE
)
14
+
15
# Define UI for the Shiny application
ui <- fluidPage(
  titlePanel("Reading comprehension difficulty prediction using BERT embeddings"),
  fluidRow(
    column(
      width = 12,
      h4("How does this work?"),
      p("This app predicts average difficulty for an item. Difficulty can be interpreted using the table below. As the table shows, difficulty increases with grade level.
      For example, an item of difficulty 0.3 is of average difficulty for Grade 3, an item of difficulty 0.4 is of average difficulty for Grade 4, and so on. Note that as difficulty increases, probability of correct answer reduces."),

      # Display the grade-level difficulty reference table
      h5(""),
      tableOutput("difficultyTable"),

      # External link for the NWEA norms the difficulty scale is based on
      p("Difficulty outputs are on a linear scale. The scale is defined to have mean difficulty of 0.3 at Grade 3 and mean difficulty 0.7 at Grade 8. This scale is based on grade level growth norms reported by ",
        a("NWEA MAP Spring 2020 Reading Student Achievement Norms",
          href = "https://www.nwea.org/uploads/MAP-Growth-Normative-Data-Overview.pdf", target = "_blank"))
    )
  ),

  # Item entry form
  sidebarLayout(
    sidebarPanel(
      width = 12,
      # textAreaInput (rather than single-line textInput) so multi-sentence
      # passages and questions can be entered and reviewed; the input IDs
      # ("passage", "question") are unchanged, so server logic is unaffected.
      textAreaInput("passage", "Passage", placeholder = "Enter passage text here", rows = 6),
      textAreaInput("question", "Question Text", placeholder = "Enter question text here", rows = 2),
      textInput("correctAnswer", "Correct Answer", placeholder = "Enter the correct answer here"),

      # Numeric input to ask how many incorrect options (distractors)
      numericInput("numDistractors", "Number of Incorrect Options:", value = 1, min = 1, max = 10),

      # One text input per distractor, generated server-side
      uiOutput("distractorsInputs"),

      actionButton("printBtn", "Estimate difficulty")
    ),

    mainPanel(
      h3("Estimated difficulty"),
      verbatimTextOutput("inputsOutput")
    )
  )
)
59
 
60
# Serialize the item fields to JSON and pipe them to bertembedtoy.py,
# which prints a comma-separated 768-dimensional BERT embedding on stdout.
#
# Args:
#   passage, question: character scalars entered by the user.
#   distractors: a single string combining the correct answer and all
#     wrong answers (built by the caller).
# Returns:
#   Character vector of the script's stdout lines (normally one line of
#   comma-separated numbers). On failure a warning is raised and whatever
#   partial output exists is returned, so the caller can validate it.
call_python_script <- function(passage, question, distractors) {
  # Prepare input data as JSON (auto_unbox keeps scalars as JSON strings,
  # not length-1 arrays)
  input_data <- toJSON(list(
    Passage = passage,
    QuestionText = question,
    Distractors = distractors
  ), auto_unbox = TRUE)

  # Call the Python script, passing the JSON on stdin and capturing stdout
  result <- system2(
    command = python_path,
    args = c("bertembedtoy.py"),
    input = input_data,
    stdout = TRUE
  )

  # With stdout = TRUE, system2() signals a non-zero exit code via a
  # "status" attribute rather than an error — surface it instead of
  # silently returning empty/partial output.
  status <- attr(result, "status")
  if (!is.null(status) && status != 0) {
    warning("bertembedtoy.py exited with status ", status, call. = FALSE)
  }

  result
}
81
+
82
+
83
# Define server logic
server <- function(input, output, session) {
  # Latest model prediction (NULL until the button is first clicked);
  # may also hold an error message string when the embedding step fails.
  predictions <- reactiveVal(NULL)

  # Render the grade-level difficulty reference table (no rownames)
  output$difficultyTable <- renderTable({
    difficulty_table
  }, rownames = FALSE)

  # Dynamically generate one text input per requested distractor
  output$distractorsInputs <- renderUI({
    n <- input$numDistractors
    if (is.null(n) || is.na(n) || n <= 0) return(NULL) # invalid count -> no inputs

    lapply(seq_len(n), function(i) {
      textInput(inputId = paste0("distractor", i),
                label = paste("Wrong answer", i),
                placeholder = paste("Enter wrong answer", i))
    })
  })

  # Estimate difficulty when the button is clicked
  observeEvent(input$printBtn, {
    req(input$numDistractors)

    # Label each distractor, then combine with the correct answer into
    # the single newline-separated string the Python script expects.
    distractors <- vapply(seq_len(input$numDistractors), function(i) {
      paste0("wrong answer ", i, ":", input[[paste0("distractor", i)]])
    }, character(1))
    answer_block <- paste(
      c(paste0("Correct answer:", input$correctAnswer), distractors),
      collapse = " \n"
    )

    # Get the 768-dim BERT embedding from the Python helper
    python_output <- call_python_script(input$passage,
                                        input$question,
                                        answer_block)
    result_vector <- as.numeric(unlist(strsplit(python_output, ",")))

    # Guard: if the script failed we will not have 768 finite numbers;
    # previously this crashed the observer when names() was assigned.
    if (length(result_vector) != 768 || anyNA(result_vector)) {
      predictions("Embedding failed: check the Python environment and inputs.")
      return()
    }

    # One-row data frame with the feature names the model was trained on
    features <- as.data.frame(t(result_vector))
    names(features) <- paste0("embed.bert", 1:768)
    predictions(predict(model, newdata = features))
  })

  # Output the latest prediction (or error message) to the UI
  output$inputsOutput <- renderPrint({
    predictions()
  })
}
139
 
140
# Launch the Shiny application
shinyApp(ui, server)
bertembedtoy.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import json
3
+ import transformers
4
+ import torch
5
+
6
+
7
def vectorize_with_pretrained_embeddings(sentences):
    """Embed each sentence with pre-trained BERT (bert-base-cased).

    Args:
        sentences: list of n sentence strings.

    Returns:
        numpy array of shape (n, 768): the mean over the sequence
        dimension of BERT's last hidden state, one row per sentence.
    """
    tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-cased")
    bert = transformers.BertModel.from_pretrained(
        "bert-base-cased", output_hidden_states=False)
    bert.eval()

    pooled = []
    for text in sentences:
        # Wrap with BERT's special tokens, then cap at the 512-token limit.
        # NOTE(review): truncating after tagging can drop the trailing
        # [SEP] for long inputs, and segment ids are all ones rather than
        # the conventional zeros for a single segment. Both are kept
        # as-is so embeddings stay identical to the pipeline that
        # produced the training data for model.rds — confirm before
        # changing.
        tokens = tokenizer.tokenize("[CLS] " + text + " [SEP]")[:512]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        token_tensor = torch.tensor([ids])
        segment_tensor = torch.ones_like(token_tensor)
        with torch.no_grad():
            # [0] selects the last hidden state:
            # shape 1 x sequence_length x 768
            hidden = bert(token_tensor, segment_tensor)[0]
        # Average across the sequence dimension to get a constant-size
        # (1, 768) vector regardless of sentence length.
        pooled.append(hidden.mean(dim=1))

    return torch.cat(pooled, dim=0).cpu().detach().numpy()
48
+
49
+
50
def main():
    """Read item JSON from stdin; print a comma-separated BERT embedding.

    Expected input keys: "Passage", "QuestionText", "Distractors"
    (missing keys default to empty strings). The 768 floats are written
    as a single comma-separated line on stdout so the calling R process
    can capture them with system2(stdout = TRUE).
    """
    # Step 1: Read and parse JSON input from stdin. Fail loudly on
    # stderr with a non-zero exit so the caller can detect the failure
    # instead of parsing an empty stdout into NAs.
    try:
        inputs = json.loads(sys.stdin.read())
    except json.JSONDecodeError as exc:
        print(f"Invalid JSON input: {exc}", file=sys.stderr)
        sys.exit(1)

    # Step 2: Extract inputs (all optional)
    passage = inputs.get("Passage", "")
    question = inputs.get("QuestionText", "")
    distractors = inputs.get("Distractors", "")

    # One combined "sentence" per item: question, answers, then passage —
    # this ordering must match the pipeline used to train model.rds.
    combined_input = [f"{question}\n{distractors}\n{passage}"]
    embedding = vectorize_with_pretrained_embeddings(combined_input)

    # Flatten to 1-D and emit as a single comma-separated line
    print(",".join(map(str, embedding.flatten())))


if __name__ == "__main__":
    main()
model.rds ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:513cc1c15ee297b97cf8f29caf681a3a3c601f3cfa7df8cb21d68ade3ca58b3c
3
+ size 10864973
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers==4.47.0
2
+ torch==2.2.2
3
+ numpy==1.26.4