|
|
---
base_model:
- alan-yahya/MatBERT
pipeline_tag: feature-extraction
tags:
- chemistry
- biology
---
|
|
### ✨ [ACL 2025] Incorporating Domain Knowledge into Materials Tokenization
|
|
|
|
|
You can find the paper [here](https://arxiv.org/abs/2506.11115).
|
|
|
|
|
|
|
|
If you only want to extract material concepts (material terms or material formulas) using **MatDetector**, please follow the steps below.
|
|
|
|
|
```python
|
|
import os |
|
|
import torch |
|
|
import torch.nn.functional as F |
|
|
from transformers import AutoTokenizer, BertForTokenClassification |
|
|
from tqdm import tqdm |
|
|
|
|
|
model_path = MatDetector_ckp |
|
|
# you can download matbert at https://github.com/lbnlp/MatBERT |
|
|
tokenizer_path = '/matbert-base-cased' |
|
|
input_file = 'TARGET.txt' |
|
|
output_directory = './' |
|
|
|
|
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, do_lower_case=False) |
|
|
model = BertForTokenClassification.from_pretrained(model_path).half() |
|
|
|
|
|
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu") |
|
|
model.to(device) |
|
|
model.eval() |
|
|
|
|
|
label_map = {0: "O", 1: "B-matname", 2: "I-matname", 3: "B-mf", 4: "I-mf"} |
|
|
|
|
|
def process_single_word(word, tokenizer, model, device): |
|
|
tokenized = tokenizer(word, return_tensors="pt", truncation=True, max_length=128) |
|
|
input_ids = tokenized["input_ids"].to(device) |
|
|
attention_mask = tokenized["attention_mask"].to(device) |
|
|
|
|
|
with torch.no_grad(): |
|
|
outputs = model(input_ids=input_ids, attention_mask=attention_mask) |
|
|
logits = outputs.logits |
|
|
probabilities = F.softmax(logits, dim=2) # (batch_size=1, seq_len, num_labels) |
|
|
|
|
|
return tokenized, probabilities |
|
|
|
|
|
|
|
|
def determine_label(tokenized, probabilities, label_map): |
|
|
tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0].tolist()) |
|
|
probs = probabilities[0] # (seq_len, num_labels) |
|
|
|
|
|
token_labels = [] |
|
|
for token, prob in zip(tokens, probs): |
|
|
if token in ["[CLS]", "[SEP]", "[PAD]"]: |
|
|
continue |
|
|
|
|
|
clean_token = token[2:] if token.startswith("##") else token |
|
|
max_label = prob.argmax().item() |
|
|
label_name = label_map[max_label] |
|
|
|
|
|
token_labels.append(label_name) |
|
|
|
|
|
label_counts = {} |
|
|
for label in token_labels: |
|
|
if label not in label_counts: |
|
|
label_counts[label] = 0 |
|
|
label_counts[label] += 1 |
|
|
|
|
|
final_label = max(label_counts, key=label_counts.get) if label_counts else "O" |
|
|
|
|
|
return final_label |
|
|
|
|
|
|
|
|
with open(os.path.join(output_directory, "mf.txt"), "w") as mf_file, \ |
|
|
open(os.path.join(output_directory, "matname.txt"), "w") as matname_file, \ |
|
|
open(os.path.join(output_directory, "o_tags.txt"), "w") as o_file: |
|
|
|
|
|
with open(input_file, 'r') as file: |
|
|
lines = [line.strip() for line in file.readlines() if line.strip()] |
|
|
total_lines = len(lines) |
|
|
|
|
|
with tqdm(total=total_lines, desc="Processing words", unit="words") as progress_bar: |
|
|
for original_word in lines: |
|
|
tokenized, probabilities = process_single_word(original_word, tokenizer, model, device) |
|
|
final_label = determine_label(tokenized, probabilities, label_map) |
|
|
|
|
|
if final_label == "O": |
|
|
o_file.write(f"{original_word}\n") |
|
|
elif final_label in ["B-mf", "I-mf"]: |
|
|
mf_file.write(f"{original_word}\n") |
|
|
elif final_label in ["B-matname", "I-matname"]: |
|
|
matname_file.write(f"{original_word}\n") |
|
|
|
|
|
progress_bar.update(1) |
|
|
|
|
|
print("Processing completed. Files saved as mf.txt, matname.txt, and o_tags.txt.") |
|
|
|
|
|
```