---
base_model:
- alan-yahya/MatBERT
pipeline_tag: feature-extraction
tags:
- chemistry
- biology
---
### ✨ [ACL 2025] Incorporating Domain Knowledge into Materials Tokenization
You can find the paper [here](https://arxiv.org/abs/2506.11115).

If you only want to extract material concepts (material terms or material formulas) with **MatDetector**, follow the steps below.
```python
import os
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, BertForTokenClassification
from tqdm import tqdm

# Paths: the MatDetector checkpoint and the MatBERT tokenizer.
# You can download matbert-base-cased at https://github.com/lbnlp/MatBERT
model_path = "MatDetector_ckp"          # path to the MatDetector checkpoint
tokenizer_path = "/matbert-base-cased"  # local MatBERT tokenizer directory

input_file = "TARGET.txt"   # one candidate term per line
output_directory = "./"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, do_lower_case=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForTokenClassification.from_pretrained(model_path)
if device.type == "cuda":
    model = model.half()  # fp16 inference only on GPU; CPU stays fp32
model.to(device)
model.eval()

# BIO labels: material names (matname) and material formulas (mf)
label_map = {0: "O", 1: "B-matname", 2: "I-matname", 3: "B-mf", 4: "I-mf"}

def process_single_word(word, tokenizer, model, device):
    """Run MatDetector on a single word/term and return its token-level label probabilities."""
    tokenized = tokenizer(word, return_tensors="pt", truncation=True, max_length=128)
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=2)  # (batch_size=1, seq_len, num_labels)
    return tokenized, probabilities

def determine_label(tokenized, probabilities, label_map):
    """Assign one label to the whole word by majority vote over its sub-word tokens."""
    tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0].tolist())
    probs = probabilities[0]  # (seq_len, num_labels)

    token_labels = []
    for token, prob in zip(tokens, probs):
        if token in ["[CLS]", "[SEP]", "[PAD]"]:
            continue  # skip special tokens
        token_labels.append(label_map[prob.argmax().item()])

    # Majority vote over the sub-word labels; default to "O" if nothing remains.
    label_counts = {}
    for label in token_labels:
        label_counts[label] = label_counts.get(label, 0) + 1
    return max(label_counts, key=label_counts.get) if label_counts else "O"

# Route each term from TARGET.txt to the output file matching its predicted label.
with open(os.path.join(output_directory, "mf.txt"), "w") as mf_file, \
     open(os.path.join(output_directory, "matname.txt"), "w") as matname_file, \
     open(os.path.join(output_directory, "o_tags.txt"), "w") as o_file:

    with open(input_file, "r") as file:
        lines = [line.strip() for line in file if line.strip()]

    with tqdm(total=len(lines), desc="Processing words", unit="words") as progress_bar:
        for original_word in lines:
            tokenized, probabilities = process_single_word(original_word, tokenizer, model, device)
            final_label = determine_label(tokenized, probabilities, label_map)

            if final_label in ["B-mf", "I-mf"]:
                mf_file.write(f"{original_word}\n")
            elif final_label in ["B-matname", "I-matname"]:
                matname_file.write(f"{original_word}\n")
            else:  # "O": not a material concept
                o_file.write(f"{original_word}\n")

            progress_bar.update(1)

print("Processing completed. Files saved as mf.txt, matname.txt, and o_tags.txt.")
```
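
`TARGET.txt` is expected to contain one candidate term per line; each term is written to `mf.txt` (material formulas), `matname.txt` (material names), or `o_tags.txt` (everything else) according to its predicted label. If you only want to spot-check a few terms interactively, here is a minimal sketch that reuses the objects defined above (the example terms are arbitrary, not taken from the paper):

```python
# Quick interactive check, reusing tokenizer, model, device, and label_map from above.
# The terms below are arbitrary examples chosen to illustrate the three label groups.
for word in ["LiFePO4", "yttria-stabilized zirconia", "temperature"]:
    tokenized, probabilities = process_single_word(word, tokenizer, model, device)
    print(word, "->", determine_label(tokenized, probabilities, label_map))
```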