File size: 3,753 Bytes
9cba22c
 
 
 
 
 
 
 
 
 
 
 
 
8bc2fc9
22fe41e
 
 
41c7a59
cfd6ece
41c7a59
22fe41e
 
41c7a59
0404a52
 
 
41c7a59
 
0404a52
 
 
41c7a59
 
 
0404a52
41c7a59
 
 
 
22fe41e
41c7a59
 
 
 
 
 
0404a52
 
cfd6ece
41c7a59
 
0404a52
 
 
 
 
 
 
 
 
 
 
 
41c7a59
1b2aab6
0404a52
41c7a59
 
0404a52
 
 
 
41c7a59
 
 
0404a52
 
41c7a59
0404a52
 
 
41c7a59
 
 
 
 
 
 
0404a52
41c7a59
0404a52
41c7a59
22fe41e
8bc2fc9
0404a52
 
41c7a59
 
 
0404a52
 
41c7a59
 
 
 
0404a52
 
 
 
41c7a59
 
0404a52
41c7a59
 
 
 
 
 
0404a52
 
 
 
 
41c7a59
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import logging
import re
import json
import os
import pandas as pd
from rag.nlp import rag_tokenizer
from . import regions


# Directory containing this module; the bundled resource files are read from
# its "res" sub-directory.
current_file_path = os.path.dirname(os.path.abspath(__file__))


def _load_json_resource(relative_path):
    """Parse the JSON file at *relative_path* (relative to this module).

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and it is decoded as UTF-8 explicitly so the result does
    not depend on the platform's default locale encoding.
    """
    full_path = os.path.join(current_file_path, relative_path)
    with open(full_path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Tab-separated table with a header row; missing cells become 0. The "cid"
# column is converted to str and used as the index so that rows can be looked
# up by stringified id.
GOODS = pd.read_csv(
    os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0
).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])

# Tokens that are dropped from company names during normalisation.
CORP_TKS = _load_json_resource("res/corp.tks.freq.json")
# Raw reference data; both are re-keyed by normalised name further down.
GOOD_CORP = _load_json_resource("res/good_corp.json")
CORP_TAG = _load_json_resource("res/corp_tag.json")


def baike(cid, default_v=0):
    """Return the ``len`` value stored in ``GOODS`` for company id *cid*.

    Args:
        cid: Company id; it is converted with ``str()`` before the lookup.
        default_v: Value returned when the lookup fails for any reason.

    Returns:
        The ``"len"`` cell of the row indexed by ``str(cid)``, or *default_v*
        if the lookup raised.
    """
    try:
        value = GOODS.loc[str(cid), "len"]
    except Exception:
        # Best-effort lookup: any failure falls back to the caller's default.
        return default_v
    return value


def corpNorm(nm, add_region=True):
    """Normalise a company name so that different spellings compare equal.

    Steps, in order:
      1. Run the name through ``rag_tokenizer.strQ2B`` and
         ``rag_tokenizer.tradi2simp`` (presumably full-width -> half-width and
         traditional -> simplified Chinese; confirm against the tokenizer)
         and lower-case it.
      2. Replace brackets, quotes, ``+``, ``*``, backslashes, tabs, spaces and
         hyphens with a single space; cut everything from a dash or from a
         ``co.`` / ``corp.`` / ``inc.`` / ``ltd`` marker onwards; strip one
         trailing generic suffix such as "有限公司".
      3. Names that are empty, or shorter than 5 characters without a leading
         region name, are returned at this point.
      4. Tokenise, drop region tokens and tokens listed in ``CORP_TKS``, and
         re-join the rest (a space is inserted between adjacent
         alphanumeric tokens).
      5. For mixed names, keep only the leading non-Latin part or the leading
         Latin part, per the two patterns below.

    Args:
        nm: Raw company name.
        add_region: When true and a region token was found, append the first
            one as ``"(region)"``.

    Returns:
        The normalised name, or ``""`` when *nm* is empty or not a ``str``.
    """
    # Only non-empty strings can be normalised. The check used to read
    # ``isinstance(nm, str)`` without the ``not``, which returned "" for every
    # real name and let non-string values through to the string operations.
    if not nm or not isinstance(nm, str):
        return ""
    nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
    # NOTE(review): pattern and replacement are identical, so this call does
    # nothing. It looks like an HTML-entity decode for the ampersand that lost
    # its escaping somewhere -- confirm against the upstream file.
    nm = re.sub(r"&", "&", nm)
    nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
    # ``count`` and ``flags`` are passed by keyword: passing them positionally
    # is deprecated and makes it easy to mistake one for the other.
    nm = re.sub(
        r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)",
        "",
        nm,
        count=10000,
        flags=re.IGNORECASE,
    )
    nm = re.sub(
        r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
        "",
        nm,
        count=10000,
        flags=re.IGNORECASE,
    )
    # Very short names are kept verbatim unless they begin with a region name.
    if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
        return nm

    tks = rag_tokenizer.tokenize(nm).split()
    # Region tokens found in the name; a leading "中国" is not counted.
    reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
    nm = ""
    for t in tks:
        if regions.isName(t) or t in CORP_TKS:
            continue
        # Keep a separating space between two adjacent alphanumeric tokens.
        if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
            nm += " "
        nm += t

    # Two or more non-Latin characters followed by a Latin tail: keep the head.
    r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
    if r:
        nm = r.group(1)
    # Three or more Latin characters followed by a non-Latin tail: keep the head.
    r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
    if r:
        nm = r.group(1)

    nm = nm.strip()
    if add_region and reg:
        nm += "(%s)" % reg[0]
    return nm


def rmNoise(n):
    """Strip bracketed fragments and separator punctuation from *n*.

    First every innermost ``(...)`` group with non-empty content is removed
    in a single pass, then all commas, periods, spaces, ampersands and any
    remaining parentheses are deleted.
    """
    bracketed_group = r"[\((][^()()]+[))]"
    separators = r"[,. &()()]+"
    without_groups = re.sub(bracketed_group, "", n)
    return re.sub(separators, "", without_groups)


# Re-key the reference data with the same rmNoise -> corpNorm(…, False)
# pipeline, so stored names are compared in normalised form.
GOOD_CORP = {corpNorm(rmNoise(c), False) for c in GOOD_CORP}

_normalized_corp_tag = {}
for c, v in CORP_TAG.items():
    cc = corpNorm(rmNoise(c), False)
    if not cc:
        # The raw name normalised to nothing; log it for inspection.
        logging.debug(c)
    # Later entries overwrite earlier ones that share a normalised key.
    _normalized_corp_tag[cc] = v
CORP_TAG = _normalized_corp_tag


def is_good(nm):
    """Tell whether company name *nm* matches an entry of ``GOOD_CORP``.

    Args:
        nm: Raw company name (``str``).

    Returns:
        ``False`` if *nm* contains "外派" (Chinese for dispatched/seconded
        staff). Otherwise ``True`` when the normalised name equals a purely
        ASCII-alphanumeric entry, or contains any other entry as a substring;
        ``False`` when nothing matches.
    """
    # Rejected before any normalisation takes place.
    if nm.find("外派") >= 0:
        return False
    nm = corpNorm(rmNoise(nm), False)
    for n in GOOD_CORP:
        if not n:
            # An entry that normalised to "" is a substring of every name and
            # would mark every company as good; ignore it.
            continue
        if re.match(r"[0-9a-zA-Z]+$", n):
            # ASCII-alphanumeric entries are compared for equality only.
            if n == nm:
                return True
        elif nm.find(n) >= 0:
            return True
    return False


def corp_tag(nm):
    """Return the tags recorded in ``CORP_TAG`` for company name *nm*.

    Args:
        nm: Raw company name (``str``).

    Returns:
        The value stored for the first matching key, in dict order, or ``[]``
        when no key matches. Keys made only of ASCII letters, digits, ``.``,
        ``,`` and spaces must equal the normalised name; other keys match when
        they occur as a substring of it.
    """
    nm = corpNorm(rmNoise(nm), False)
    for n, tags in CORP_TAG.items():
        if not n:
            # A key that normalised to "" would match every name and used to
            # raise ZeroDivisionError in the length-ratio check below.
            continue
        if re.match(r"[0-9a-zA-Z., ]+$", n):
            if n == nm:
                return tags
        elif nm.find(n) >= 0:
            # A key shorter than 3 characters is ignored when the name is at
            # least twice as long as the key (same test as the former
            # ``len(nm) / len(n) >= 2``, written without a division).
            if len(n) < 3 and len(nm) >= 2 * len(n):
                continue
            return tags
    return []
    return []