# -*- coding: utf-8 -*-
# @Time : 2021/12/2 5:41 p.m.
# @Author : JianingWang
# @File : common.py
def is_chinese_char(cp: int) -> bool:
    """Check whether ``cp`` is the code point of a CJK ideograph.

    A "Chinese character" is defined here as anything in the CJK Unified
    Ideographs blocks (including the extensions and compatibility blocks
    listed below):
    https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)

    Note that the CJK Unicode blocks do NOT cover all Japanese and Korean
    characters, despite the name. The modern Korean Hangul alphabet is a
    different block, as are Japanese Hiragana and Katakana. Those scripts
    are used to write space-separated words, so they are not treated
    specially and are handled like all of the other languages.

    Args:
        cp: An integer Unicode code point, e.g. the result of ``ord(char)``.

    Returns:
        True if ``cp`` falls inside one of the ranges below, else False.
    """
    return (
        (0x4E00 <= cp <= 0x9FFF)  # CJK Unified Ideographs
        or (0x3400 <= cp <= 0x4DBF)  # Extension A
        or (0x20000 <= cp <= 0x2A6DF)  # Extension B
        or (0x2A700 <= cp <= 0x2B73F)  # Extension C
        or (0x2B740 <= cp <= 0x2B81F)  # Extension D
        or (0x2B820 <= cp <= 0x2CEAF)  # Extension E
        or (0xF900 <= cp <= 0xFAFF)  # CJK Compatibility Ideographs
        or (0x2F800 <= cp <= 0x2FA1F)  # CJK Compatibility Ideographs Supplement
    )
def is_chinese(word: str) -> int:
    """Report whether every character of ``word`` is a CJK ideograph.

    Example inputs: "180" -> 0, "身高" -> 1, "神" -> 1.

    Args:
        word: The string to inspect.

    Returns:
        1 if all characters satisfy ``is_chinese_char``, otherwise 0.
        An empty string has no offending character and therefore yields 1.
    """
    # The result stays an int (not a bool) to preserve the existing 0/1
    # return contract.
    return int(all(is_chinese_char(ord(char)) for char in word))