yerim0210
/

MatDetector

PyTorch

bert

Model card Files Files and versions

xet

Community

yerim0210 commited on Jun 17

Commit

cbe4c21

verified ·

1 Parent(s): bf1410e

Create README.md

Browse files

Files changed (1) hide show

README.md +95 -0

README.md ADDED Viewed

	@@ -0,0 +1,95 @@

+### ✨ [ACL 2025] Incorporating Domain Knowledge into Materials Tokenization
+You can find this paper in [here](https://arxiv.org/abs/2506.11115)
+If you only want to extract material concepts (material term or material formula) using **MatDetector**, please follow the steps below.
+```python
+import os
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, BertForTokenClassification
+from tqdm import tqdm
+model_path = MatDetector_ckp
+# you can download matbert at https://github.com/lbnlp/MatBERT
+tokenizer_path = '/matbert-base-cased'
+input_file = 'TARGET.txt'
+output_directory = './'
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, do_lower_case=False)
+model = BertForTokenClassification.from_pretrained(model_path).half()
+device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
+model.to(device)
+model.eval()
+label_map = {0: "O", 1: "B-matname", 2: "I-matname", 3: "B-mf", 4: "I-mf"}
+def process_single_word(word, tokenizer, model, device):
+    tokenized = tokenizer(word, return_tensors="pt", truncation=True, max_length=128)
+    input_ids = tokenized["input_ids"].to(device)
+    attention_mask = tokenized["attention_mask"].to(device)
+    with torch.no_grad():
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+    logits = outputs.logits
+    probabilities = F.softmax(logits, dim=2)  # (batch_size=1, seq_len, num_labels)
+    return tokenized, probabilities
+def determine_label(tokenized, probabilities, label_map):
+    tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0].tolist())
+    probs = probabilities[0]  # (seq_len, num_labels)
+    token_labels = []
+    for token, prob in zip(tokens, probs):
+        if token in ["[CLS]", "[SEP]", "[PAD]"]:
+            continue
+        clean_token = token[2:] if token.startswith("##") else token
+        max_label = prob.argmax().item()
+        label_name = label_map[max_label]
+        token_labels.append(label_name)
+    label_counts = {}
+    for label in token_labels:
+        if label not in label_counts:
+            label_counts[label] = 0
+        label_counts[label] += 1
+    final_label = max(label_counts, key=label_counts.get) if label_counts else "O"
+    return final_label
+with open(os.path.join(output_directory, "mf.txt"), "w") as mf_file, \
+     open(os.path.join(output_directory, "matname.txt"), "w") as matname_file, \
+     open(os.path.join(output_directory, "o_tags.txt"), "w") as o_file:
+    with open(input_file, 'r') as file:
+        lines = [line.strip() for line in file.readlines() if line.strip()]
+        total_lines = len(lines)
+        with tqdm(total=total_lines, desc="Processing words", unit="words") as progress_bar:
+            for original_word in lines:
+                tokenized, probabilities = process_single_word(original_word, tokenizer, model, device)
+                final_label = determine_label(tokenized, probabilities, label_map)
+                if final_label == "O":
+                    o_file.write(f"{original_word}\n")
+                elif final_label in ["B-mf", "I-mf"]:
+                    mf_file.write(f"{original_word}\n")
+                elif final_label in ["B-matname", "I-matname"]:
+                    matname_file.write(f"{original_word}\n")
+                progress_bar.update(1)
+print("Processing completed. Files saved as mf.txt, matname.txt, and o_tags.txt.")
+```