a-menu committed
Commit bebc8b9 · 1 Parent(s): 817ffad

Update spaCy pipeline

README.md CHANGED
@@ -1,3 +1,37 @@
  ---
+ tags:
+ - spacy
+ language:
+ - fr
  license: cc-by-nc-2.0
+ model-index:
+ - name: fr_arches_sentencizer
+   results:
+   - task:
+       name: SENTS
+       type: token-classification
+     metrics:
+     - name: Sentences F-Score
+       type: f_score
+       value: 0.4066841152
  ---
+ | Feature | Description |
+ | --- | --- |
+ | **Name** | `fr_arches_sentencizer` |
+ | **Version** | `0.0.0` |
+ | **spaCy** | `>=3.6.1,<3.7.0` |
+ | **Default Pipeline** | `preprocess_text`, `senter` |
+ | **Components** | `preprocess_text`, `senter` |
+ | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
+ | **Sources** | n/a |
+ | **License** | `cc-by-nc 2.0` |
+ | **Author** | [n/a]() |
+
+ ### Accuracy
+
+ | Type | Score |
+ | --- | --- |
+ | `SENTS_F` | 40.67 |
+ | `SENTS_P` | 52.91 |
+ | `SENTS_R` | 33.03 |
+ | `SENTER_LOSS` | 162448.50 |
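
For orientation, a minimal usage sketch for the card above. It assumes the wheel shipped in this commit has been installed (e.g. `pip install fr_arches_sentencizer-any-py3-none-any.whl`) and that the package registers the custom `preprocess_text` component on import, as spaCy packages built with `spacy package --code` do; the sample sentence is invented:

```python
import spacy

# Assumes: pip install fr_arches_sentencizer-any-py3-none-any.whl
# The packaged pipeline runs `preprocess_text`, then `senter`.
nlp = spacy.load("fr_arches_sentencizer")

doc = nlp("Sondage réalisé en 2003. Présence d'un mur gallo-\nromain.")

# `senter` sets sentence boundaries, exposed via doc.sents.
for sent in doc.sents:
    print(sent.text)
```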
config.cfg ADDED
@@ -0,0 +1,124 @@
+ [paths]
+ train = "drive/MyDrive/ARCHES/TAL/donnees/pretraitement/modele_segmentation/extraction_wavestone/spacy/train.spacy"
+ dev = "drive/MyDrive/ARCHES/TAL/donnees/pretraitement/modele_segmentation/extraction_wavestone/spacy/dev.spacy"
+ vectors = null
+ init_tok2vec = null
+
+ [system]
+ gpu_allocator = "pytorch"
+ seed = 0
+
+ [nlp]
+ lang = "fr"
+ pipeline = ["preprocess_text","senter"]
+ batch_size = 128
+ disabled = []
+ before_creation = null
+ after_creation = null
+ after_pipeline_creation = null
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+ [components]
+
+ [components.preprocess_text]
+ factory = "preprocess_text"
+
+ [components.senter]
+ factory = "senter"
+ overwrite = false
+ scorer = {"@scorers":"spacy.senter_scorer.v1"}
+
+ [components.senter.model]
+ @architectures = "spacy.Tagger.v2"
+ nO = null
+ normalize = false
+
+ [components.senter.model.tok2vec]
+ @architectures = "spacy.HashEmbedCNN.v2"
+ pretrained_vectors = null
+ width = 12
+ depth = 1
+ embed_size = 2000
+ window_size = 1
+ maxout_pieces = 2
+ subword_features = true
+
+ [corpora]
+
+ [corpora.dev]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.dev}
+ max_length = 0
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [corpora.train]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.train}
+ max_length = 0
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [training]
+ accumulate_gradient = 3
+ dev_corpus = "corpora.dev"
+ train_corpus = "corpora.train"
+ seed = ${system.seed}
+ gpu_allocator = ${system.gpu_allocator}
+ dropout = 0.1
+ patience = 1600
+ max_epochs = 0
+ max_steps = 20000
+ eval_frequency = 200
+ frozen_components = []
+ annotating_components = []
+ before_to_disk = null
+ before_update = null
+
+ [training.batcher]
+ @batchers = "spacy.batch_by_padded.v1"
+ discard_oversize = true
+ size = 2000
+ buffer = 256
+ get_length = null
+
+ [training.logger]
+ @loggers = "spacy.ConsoleLogger.v1"
+ progress_bar = false
+
+ [training.optimizer]
+ @optimizers = "Adam.v1"
+ beta1 = 0.9
+ beta2 = 0.999
+ L2_is_weight_decay = true
+ L2 = 0.01
+ grad_clip = 1.0
+ use_averages = false
+ eps = 0.00000001
+
+ [training.optimizer.learn_rate]
+ @schedules = "warmup_linear.v1"
+ warmup_steps = 250
+ total_steps = 20000
+ initial_rate = 0.00005
+
+ [training.score_weights]
+ sents_f = 1.0
+ sents_p = 0.0
+ sents_r = 0.0
+
+ [pretraining]
+
+ [initialize]
+ vectors = ${paths.vectors}
+ init_tok2vec = ${paths.init_tok2vec}
+ vocab_data = null
+ lookups = null
+ before_init = null
+ after_init = null
+
+ [initialize.components]
+
+ [initialize.tokenizer]
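
To rerun this config outside of Google Drive, here is a hedged sketch using spaCy's Python training entry point (exposed as `spacy.cli.train.train` since v3.2). The local `.spacy` paths and the `output` directory are placeholders that override the Drive locations hard-coded in `[paths]` above:

```python
from spacy.cli.train import train

# Placeholder local corpus paths; they override the Google Drive
# locations hard-coded in [paths] in config.cfg.
train(
    "config.cfg",
    output_path="output",
    overrides={
        "paths.train": "corpus/train.spacy",
        "paths.dev": "corpus/dev.spacy",
    },
)
```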
fr_arches_sentencizer-any-py3-none-any.whl ADDED
Binary file (315 kB).
 
meta.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "lang":"fr",
+   "name":"arches_sentencizer",
+   "version":"0.0.0",
+   "description":"",
+   "author":"",
+   "email":"",
+   "url":"",
+   "license":"cc-by-nc 2.0",
+   "spacy_version":">=3.6.1,<3.7.0",
+   "spacy_git_version":"458bc5f45",
+   "vectors":{
+     "width":0,
+     "vectors":0,
+     "keys":0,
+     "name":null
+   },
+   "labels":{
+
+   },
+   "pipeline":[
+     "preprocess_text",
+     "senter"
+   ],
+   "components":[
+     "preprocess_text",
+     "senter"
+   ],
+   "disabled":[
+
+   ],
+   "performance":{
+     "sents_f":0.4066841152,
+     "sents_p":0.529072813,
+     "sents_r":0.3302812296,
+     "senter_loss":1624.4849906801
+   },
+   "requirements":[
+
+   ]
+ }
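
These fields surface at runtime on the loaded pipeline; a quick sketch, assuming the package is installed as above:

```python
import spacy

nlp = spacy.load("fr_arches_sentencizer")

# meta.json is exposed verbatim on the Language object.
print(nlp.meta["spacy_version"])           # >=3.6.1,<3.7.0
print(nlp.meta["performance"]["sents_f"])  # 0.4066841152
```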
preprocess_text.py ADDED
@@ -0,0 +1,37 @@
+ import re
+ from spacy.tokens import Doc
+ from spacy.language import Language
+
+
+ @Language.component("preprocess_text")
+ def preprocess_text(doc):
+
+     text = doc.text
+
+     # Strip a trailing whitespace character at the end of each line
+     etape1_in = re.compile(r'\s$', re.MULTILINE)
+     etape1_out = ''
+
+     # Join words hyphenated across a line break,
+     # so that "gallo-\nromain" becomes "gallo-romain"
+     etape2_in = re.compile(r'(([a-zà-ÿ]|[A-ZÀ-Ÿ])-)\n')
+     etape2_out = r'\1'
+
+     # Replace the remaining line breaks with a space
+     etape3_in = re.compile(r'\n')
+     etape3_out = ' '
+
+     # Collapse runs of whitespace into a single space
+     etape4_in = re.compile(r'\s{2,}')
+     etape4_out = ' '
+
+     # Apply the transformations in order
+     sortie1 = etape1_in.sub(etape1_out, text)
+     sortie2 = etape2_in.sub(etape2_out, sortie1)
+     sortie3 = etape3_in.sub(etape3_out, sortie2)
+     sortie4 = etape4_in.sub(etape4_out, sortie3)
+
+     # Build a new Doc from the cleaned, whitespace-split text
+     modified_doc = Doc(doc.vocab, words=sortie4.split())
+
+     return modified_doc
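
To see the component in isolation, a small sketch on a blank French pipeline; it assumes preprocess_text.py is importable from the working directory so the decorator registers the factory, and the sample string is invented:

```python
import spacy

import preprocess_text  # noqa: F401  (registers the "preprocess_text" factory)

nlp = spacy.blank("fr")
nlp.add_pipe("preprocess_text")

# Hyphenated line breaks are joined and whitespace is normalised.
doc = nlp("Mur gallo-\nromain  en  pierre \n")
print([t.text for t in doc])  # ['Mur', 'gallo-romain', 'en', 'pierre']
```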
senter/cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "overwrite":false
+ }
senter/model ADDED
Binary file (255 kB).
 
tokenizer ADDED
The diff for this file is too large to render.
 
vocab/key2row ADDED
@@ -0,0 +1 @@
+
vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
+ size 1
vocab/strings.json ADDED
The diff for this file is too large to render.
 
vocab/vectors ADDED
Binary file (128 Bytes).
 
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "mode":"default"
+ }