DeepLearning101's picture
Upload 21 files
45311fe
raw
history blame
1.31 kB
# -*- coding: utf-8 -*-
# @Time : 2021/12/2 5:41 p.m.
# @Author : JianingWang
# @File : common.py
def is_chinese_char(cp):
    """Return True when codepoint ``cp`` lies in one of the CJK ideograph ranges.

    ``cp`` is an integer codepoint (e.g. the result of ``ord``), not a string.

    A "Chinese character" here means anything in the CJK Unified Ideographs
    block, the extension ranges listed below, or the two compatibility
    ideograph ranges:
    https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)

    Despite the name, these ranges do NOT contain every Japanese and Korean
    character: modern Korean Hangul, and Japanese Hiragana and Katakana, live
    in separate blocks. Those scripts write space-separated words, so they
    get no special treatment and are handled like every other language.
    """
    # Inclusive (first, last) codepoint pairs, tested in this order.
    cjk_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )
    return any(first <= cp <= last for first, last in cjk_ranges)
def is_chinese(word: str):
    """Return 1 if every character of ``word`` is a CJK ideograph, else 0.

    Each character is converted to its codepoint with ``ord`` and checked
    with ``is_chinese_char``; the scan stops at the first character that
    fails. Example inputs: "180" gives 0, while "身高" and "神" give 1.
    An empty ``word`` has no failing character and therefore gives 1.
    """
    # all() short-circuits on the first non-CJK character; int() keeps the
    # 0/1 integer result rather than returning a bool.
    return int(all(is_chinese_char(ord(symbol)) for symbol in word))