|
import re |
|
from spacy.tokens import Doc |
|
from spacy.language import Language |
|
|
|
|
|
# Patterns are compiled once at import time instead of on every call.
# Raw strings keep ``\s`` from being read as an (invalid) string escape.
_TRAILING_SPACE_RE = re.compile(r'\s$', re.MULTILINE)
_HYPHEN_LINE_BREAK_RE = re.compile(r'(([a-zà-ÿ]|[A-ZÀ-Ÿ])-)\n')
_NEWLINE_RE = re.compile(r'\n')
_SPACE_RUN_RE = re.compile(r'\s{2,}')


def _normalize_text(text: str) -> str:
    """Return *text* with line breaks flattened into single spaces.

    The substitutions run in a fixed order, each on the previous result:

    1. Drop one whitespace character sitting directly at a line end.
    2. Join a line ending in ``<letter>-`` with the following line,
       keeping the hyphen and dropping only the newline.
    3. Turn every remaining newline into a space.
    4. Collapse any run of two or more whitespace characters to one space.
    """
    text = _TRAILING_SPACE_RE.sub('', text)
    # Must run after step 1 so that "mot- \n" has already become "mot-\n".
    text = _HYPHEN_LINE_BREAK_RE.sub(r'\1', text)
    text = _NEWLINE_RE.sub(' ', text)
    return _SPACE_RUN_RE.sub(' ', text)


@Language.component("preprocess_text")
def preprocess_text(doc: Doc) -> Doc:
    """Rebuild *doc* from its whitespace-normalized text.

    Args:
        doc: The incoming document; only ``doc.text`` and ``doc.vocab``
            are read.

    Returns:
        A new ``Doc`` sharing the input's vocab, whose words are the
        whitespace-separated pieces of the cleaned text. Only ``vocab``
        and ``words`` are passed to the new ``Doc``.
    """
    cleaned = _normalize_text(doc.text)
    return Doc(doc.vocab, words=cleaned.split())
|
|
|
|
|
# Explicitly register the function under the "preprocess_text" component name.
# NOTE(review): preprocess_text already carries a
# @Language.component("preprocess_text") decorator, so this call looks
# redundant -- confirm nothing relies on it before removing.
Language.component("preprocess_text", func=preprocess_text)