Spaces:
Sleeping
Sleeping
| import re | |
| import pandas as pd | |
def parse_text_file(text):
    """Split a numbered listing into its individual records.

    A record starts at a marker made of digits followed by a dot
    (``1.``, ``2.``, ...). A marker is recognised either after three
    consecutive newlines or at the very beginning of the text (optionally
    preceded by whitespace), so the first record is kept even when the
    text does not open with blank lines.

    Args:
        text: Full text containing the numbered records.

    Returns:
        A list of record bodies with the numeric marker removed and
        surrounding whitespace stripped. Anything that precedes the first
        marker is discarded.
    """
    # \d+ matches one or more digits. The \A alternative lets the first
    # marker match at the start of the text, where no "\n\n\n" precedes it.
    marker = re.compile(r'(?:\A\s*|\n\n\n)\d+\.')
    # The first piece is whatever comes before the first marker (usually
    # empty), so it is not a record and is dropped.
    pieces = marker.split(text)[1:]
    return [piece.strip() for piece in pieces]
def split_sections(text):
    """Map the blank-line separated blocks of one record to named fields.

    The record is split on empty lines, blank blocks are dropped, and the
    remaining blocks are assigned to field names purely by position. The
    set of field names depends on how many blocks were found (5 to 8).

    Block counts outside that range used to fail with an unbound ``keys``
    variable. They are now handled on a best-effort basis: fewer than five
    blocks use the five-field layout with missing trailing fields set to
    ``""``; more than eight blocks use the eight-field layout and the
    surplus blocks are ignored.

    Args:
        text: Text of a single record.

    Returns:
        A dict from field name to block text, in layout order.
    """
    layouts = {
        8: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI', 'COI'],
        7: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'Copyrights', 'DOI'],
        6: ['PublishInfo', 'Title', 'AuthorName', 'AuthorInfo', 'Abstract', 'DOI'],
        5: ['PublishInfo', 'Title', 'AuthorName', 'Abstract', 'DOI'],
    }
    stripped = (block.strip() for block in text.split('\n\n'))
    blocks = [block for block in stripped if block]

    # Clamp the block count to the nearest known layout so that `keys` is
    # always defined, whatever the record looks like.
    keys = layouts[min(max(len(blocks), 5), 8)]

    # Pad with empty strings when blocks are missing; zip() drops any
    # blocks beyond the last key.
    padded = blocks + [""] * (len(keys) - len(blocks))
    return dict(zip(keys, padded))
def GetSummaryDf(textdir):
    """Load a text file of numbered records into a DataFrame.

    The file is read as UTF-8, split into records with
    ``parse_text_file`` and each record is turned into a field dict with
    ``split_sections``.

    Args:
        textdir: Path of the text file to read (passed to ``open``).

    Returns:
        A ``pandas.DataFrame`` built from the list of field dicts, one
        dict per record.
    """
    with open(textdir, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    records = [split_sections(entry) for entry in parse_text_file(raw_text)]
    return pd.DataFrame(records)