YAML Metadata
Warning:
empty or missing yaml metadata in repo card
(https://huggingface.co/docs/hub/model-cards#model-card-metadata)
✨ [ACL 2025] Incorporating Domain Knowledge into Materials Tokenization
You can find the paper here.
If you only want to extract material concepts (material terms or material formulas) using MatDetector, please follow the steps below.
import os
from collections import Counter

import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, BertForTokenClassification
# Path to the MatDetector checkpoint; replace with the location of your copy.
# (The original snippet used a bare, undefined name here, which raises NameError.)
model_path = "MatDetector_ckp"
# MatBERT can be downloaded from https://github.com/lbnlp/MatBERT
tokenizer_path = '/matbert-base-cased'
input_file = 'TARGET.txt'  # read line by line; each non-empty line is classified as one word
output_directory = './'

# Choose the device before loading so precision can be matched to it.
# "cuda" selects the current default GPU; use CUDA_VISIBLE_DEVICES to pin a
# specific card instead of hard-coding an index that may not exist.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=False, do_lower_case=False)
model = BertForTokenClassification.from_pretrained(model_path)
if device.type == "cuda":
    # Half precision is only enabled on GPU: fp16 support for CPU ops varies
    # between PyTorch versions, so the CPU fallback stays in full precision.
    model = model.half()
model.to(device)
model.eval()

# Index-to-name mapping for the classifier's output labels.
label_map = {0: "O", 1: "B-matname", 2: "I-matname", 3: "B-mf", 4: "I-mf"}
def process_single_word(word, tokenizer, model, device):
    """Classify the tokens of a single word and return label probabilities.

    Args:
        word: Text to tokenize (truncated to at most 128 tokens).
        tokenizer: Callable producing ``input_ids`` and ``attention_mask``
            tensors when invoked with ``return_tensors="pt"``.
        model: Callable returning an object with a ``logits`` attribute.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        A ``(tokenized, probabilities)`` pair: the tokenizer output as-is and
        the softmax of the logits taken over dimension 2.
    """
    encoding = tokenizer(word, return_tensors="pt", truncation=True, max_length=128)
    ids = encoding["input_ids"].to(device)
    mask = encoding["attention_mask"].to(device)

    # Inference only: no gradient bookkeeping is needed.
    with torch.no_grad():
        scores = model(input_ids=ids, attention_mask=mask).logits
        label_probs = F.softmax(scores, dim=2)  # (batch_size=1, seq_len, num_labels)

    return encoding, label_probs
def determine_label(tokenized, probabilities, label_map, tokenizer=None):
    """Pick one label for a word by majority vote over its sub-word tokens.

    Args:
        tokenized: Tokenizer output for one word; only
            ``tokenized["input_ids"][0]`` is read.
        probabilities: Per-token label probabilities; only the first batch
            entry is used, and the argmax over its last axis is the
            predicted label index of each token.
        label_map: Mapping from label index to label name.
        tokenizer: Object providing ``convert_ids_to_tokens``. When omitted,
            the module-level ``tokenizer`` is used, so existing
            three-argument calls keep working.

    Returns:
        The most frequent predicted label among the non-special tokens
        (on a tie, the label that appeared first wins), or ``"O"`` when
        no tokens remain after removing ``[CLS]``, ``[SEP]`` and ``[PAD]``.
    """
    if tokenizer is None:
        # Backward compatibility: fall back to the script-level tokenizer.
        tokenizer = globals()["tokenizer"]

    tokens = tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0].tolist())
    # One vectorised argmax instead of a per-token .item() call.
    predicted = probabilities[0].argmax(dim=-1).tolist()

    votes = Counter(
        label_map[label_id]
        for token, label_id in zip(tokens, predicted)
        if token not in ("[CLS]", "[SEP]", "[PAD]")
    )
    # max() over the Counter keeps first-seen order on ties.
    return max(votes, key=votes.get) if votes else "O"
# Classify every non-empty input line and append it to the output file that
# matches its predicted category. Files are opened as UTF-8 explicitly so that
# non-ASCII text is handled the same way on every platform.
with open(os.path.join(output_directory, "mf.txt"), "w", encoding="utf-8") as mf_file, \
        open(os.path.join(output_directory, "matname.txt"), "w", encoding="utf-8") as matname_file, \
        open(os.path.join(output_directory, "o_tags.txt"), "w", encoding="utf-8") as o_file:
    # The B- and I- variants of a category share one output file.
    destinations = {
        "O": o_file,
        "B-mf": mf_file,
        "I-mf": mf_file,
        "B-matname": matname_file,
        "I-matname": matname_file,
    }

    with open(input_file, 'r', encoding="utf-8") as file:
        # Iterate the file directly; blank lines are skipped.
        lines = [line.strip() for line in file if line.strip()]

    for original_word in tqdm(lines, desc="Processing words", unit="words"):
        tokenized, probabilities = process_single_word(original_word, tokenizer, model, device)
        final_label = determine_label(tokenized, probabilities, label_map)
        destination = destinations.get(final_label)
        # Labels without a destination are skipped, as in the original if/elif chain.
        if destination is not None:
            destination.write(f"{original_word}\n")

print("Processing completed. Files saved as mf.txt, matname.txt, and o_tags.txt.")
- Downloads last month
- 4
Inference Providers
NEW
This model isn't deployed by any Inference Provider.
🙋
Ask for provider support