# Hugging Face Space: biomedical NER annotator — reads a notes CSV,
# runs token-classification NER per row, writes extracted entities to JSON.
import argparse
import csv
import json

from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
# Biomedical NER checkpoint (token classification) from the HF hub.
MODEL = "d4data/biomedical-ner-all"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForTokenClassification.from_pretrained(MODEL)

# "simple" aggregation merges word-piece tokens into whole entity spans,
# so each result carries `entity_group`, `word`, `score`, `start`, `end`.
pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
def process(*args):
    """Run biomedical NER over a notes CSV and write entities as JSON.

    Args:
        *args: optional command-line tokens, e.g.
            ``("--notes", "notes.csv", "--out", "out.json")``.
            When empty, argparse falls back to ``sys.argv[1:]``.

    Raises:
        ValueError: if ``--notes`` is not a ``.csv`` path or ``--out`` is
            not a ``.json`` path.
        SystemExit: from argparse when required arguments are missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--notes', help='Notes CSV', required=True)
    parser.add_argument('--out', help='Output', required=True)
    # Bug fix: honor tokens passed to this function instead of always
    # reading sys.argv — the original accepted *args but silently ignored
    # it (and shadowed the name with the parse result).
    cli = parser.parse_args(list(args) if args else None)
    filepath = cli.notes
    outpath = cli.out
    if not filepath.endswith(".csv"):
        raise ValueError("Filepath must be a .csv file.")
    if not outpath.endswith(".json"):
        raise ValueError("Output path must be a .json file.")
    processed = []
    # newline="" is the csv module's documented open mode; explicit
    # encoding avoids platform-dependent defaults.
    with open(filepath, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Expects columns: text, score, student_id, case — raises
            # KeyError on a malformed CSV, which is the desired loud failure.
            text = row["text"]
            raw = pipe(text)  # aggregated NER spans for this note
            ner_content = {
                "score": row["score"],
                "student_id": row["student_id"],
                "case": row["case"],
                "entities": [
                    {
                        "entity": x["entity_group"],
                        "word": x["word"],
                        # pipeline scores are numpy floats; cast so
                        # json.dump can serialize them.
                        "score": round(float(x["score"]), 2),
                        "start": x["start"],
                        "end": x["end"],
                    }
                    for x in raw
                ],
            }
            processed.append(ner_content)
    with open(outpath, "w", encoding="utf-8") as f:
        json.dump(processed, f)
if __name__ == "__main__":
    # Forward the CLI tokens explicitly; `process` parses them via argparse.
    import sys as _sys

    process(*_sys.argv[1:])