[#1] fetch_alpha implemented
Browse files
- explore/explore_fetch_alpha.py +10 -0
- explore/explore_fetch_alpha_predict.py +19 -0
- explore/explore_fetch_epie.py +0 -27
- idiomify/fetchers.py +17 -2
- idiomify/paths.py +0 -4
explore/explore_fetch_alpha.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Exploration script: fetch the "overfit" version of Alpha and print its BART config."""
from idiomify.fetchers import fetch_alpha


def main():
    """Fetch the "overfit" Alpha model and print the config of its ``bart`` attribute."""
    alpha = fetch_alpha("overfit")
    bart_config = alpha.bart.config
    print(bart_config)


if __name__ == "__main__":
    main()
|
explore/explore_fetch_alpha_predict.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Exploration script: fetch the "overfit" Alpha model and run predict on dummy sources."""
from transformers import BartTokenizer

# Import through the idiomify package (as explore_fetch_alpha.py does) so the
# script does not depend on idiomify/ itself being on sys.path.
from idiomify.builders import SourcesBuilder
from idiomify.fetchers import fetch_alpha


def main():
    """
    Build sources from two dummy pairs with SourcesBuilder, pass them to
    ``model.predict`` and print whatever it returns.
    """
    model = fetch_alpha("overfit")
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    # just some dummy stuff; the second element of each pair is left empty
    lit2idi = [
        ("my man", ""),
        ("hello", ""),
    ]
    srcs = SourcesBuilder(tokenizer)(lit2idi)
    out = model.predict(srcs=srcs)
    print(out)


if __name__ == "__main__":
    main()
|
explore/explore_fetch_epie.py
DELETED
@@ -1,27 +0,0 @@
|
|
1 |
-
|
2 |
-
"""Exploration script: print the EPIE (idiom, context) pairs and the distinct idioms."""
from idiomify.fetchers import fetch_epie


def main():
    """
    Print every (idiom, context) pair returned by fetch_epie, then enumerate
    and print the distinct idioms.
    """
    # NOTE(review): idiomify/fetchers.py declares fetch_epie(ver: str), so this
    # zero-argument call looks like it raises TypeError - confirm which version
    # was intended before running.
    epie = fetch_epie()
    # a set comprehension builds the distinct idioms directly, without a throwaway list
    idioms = {idiom for idiom, _, _ in epie}

    # so, what do you want? do you want to build idiom-masked language modeling?
    for idiom, context, _ in epie:
        print(idiom, context)

    for idx, idiom in enumerate(idioms):
        print(idx, idiom)

    # isn't it better to just leave the idiom there, and have it guess what meaning it has?
    # in that case, it may be better to use a generative model?
    # but what would happen if you let it... just guess it?
    # the problem with non-masking is that... you give the model the answer.
    # what you should rather do is... something like finding similar words.


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
idiomify/fetchers.py
CHANGED
@@ -5,7 +5,7 @@ import wandb
|
|
5 |
import requests
|
6 |
from typing import Tuple, List
|
7 |
from wandb.sdk.wandb_run import Run
|
8 |
-
from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic
|
9 |
from idiomify.urls import (
|
10 |
EPIE_IMMUTABLE_IDIOMS_URL,
|
11 |
EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL,
|
@@ -15,9 +15,10 @@ from idiomify.urls import (
|
|
15 |
EPIE_MUTABLE_IDIOMS_TAGS_URL,
|
16 |
PIE_URL
|
17 |
)
|
|
|
|
|
18 |
|
19 |
|
20 |
-
# sources for dataset
|
21 |
def fetch_epie(ver: str) -> List[Tuple[str, str, str]]:
|
22 |
"""
|
23 |
It fetches the EPIE idioms, contexts, and tags from the web
|
@@ -85,6 +86,20 @@ def fetch_literal2idiomatic(ver: str, run: Run = None) -> List[Tuple[str, str]]:
|
|
85 |
return [(row[0], row[1]) for row in reader]
|
86 |
|
87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
def fetch_config() -> dict:
    """
    It opens the file at CONFIG_YAML as utf-8 text and returns
    what yaml.safe_load parses from it.
    """
    config_path = str(CONFIG_YAML)
    with open(config_path, mode='r', encoding="utf-8") as stream:
        loaded = yaml.safe_load(stream)
    return loaded
|
|
|
5 |
import requests
|
6 |
from typing import Tuple, List
|
7 |
from wandb.sdk.wandb_run import Run
|
8 |
+
from idiomify.paths import CONFIG_YAML, idioms_dir, literal2idiomatic, alpha_dir
|
9 |
from idiomify.urls import (
|
10 |
EPIE_IMMUTABLE_IDIOMS_URL,
|
11 |
EPIE_IMMUTABLE_IDIOMS_CONTEXTS_URL,
|
|
|
15 |
EPIE_MUTABLE_IDIOMS_TAGS_URL,
|
16 |
PIE_URL
|
17 |
)
|
18 |
+
from transformers import AutoModelForSeq2SeqLM, AutoConfig
|
19 |
+
from models import Alpha
|
20 |
|
21 |
|
|
|
22 |
def fetch_epie(ver: str) -> List[Tuple[str, str, str]]:
|
23 |
"""
|
24 |
It fetches the EPIE idioms, contexts, and tags from the web
|
|
|
86 |
return [(row[0], row[1]) for row in reader]
|
87 |
|
88 |
|
89 |
+
def fetch_alpha(ver: str, run: Run = None) -> Alpha:
    """
    It fetches the given version of the alpha model artifact from wandb,
    downloads it into alpha_dir(ver) and restores an Alpha from its model.ckpt.

    :param ver: version of the "alpha" artifact to fetch, e.g. "overfit"
    :param run: if given, the artifact is obtained with run.use_artifact;
     otherwise it is looked up with wandb.Api() under eubinecto/idiomify
    :return: the Alpha returned by Alpha.load_from_checkpoint
    """
    if run:
        artifact = run.use_artifact(f"alpha:{ver}", type="model")
    else:
        artifact = wandb.Api().artifact(f"eubinecto/idiomify/alpha:{ver}", type="model")
    # the artifact's metadata names the bart configuration to build the model from
    config = artifact.metadata
    artifact_dir = artifact.download(root=alpha_dir(ver))
    ckpt_path = path.join(artifact_dir, "model.ckpt")
    # from_config builds the model from its configuration only; presumably the
    # trained weights are restored from the checkpoint below - TODO confirm
    bart = AutoModelForSeq2SeqLM.from_config(AutoConfig.from_pretrained(config['bart']))
    # load_from_checkpoint is handed the path itself, so the checkpoint is not
    # opened here (a text-mode handle on it would be unused anyway)
    alpha = Alpha.load_from_checkpoint(ckpt_path, bart=bart)
    return alpha
|
101 |
+
|
102 |
+
|
103 |
def fetch_config() -> dict:
    """
    It opens the file at CONFIG_YAML as utf-8 text and returns
    what yaml.safe_load parses from it.
    """
    config_path = str(CONFIG_YAML)
    with open(config_path, mode='r', encoding="utf-8") as stream:
        loaded = yaml.safe_load(stream)
    return loaded
|
idiomify/paths.py
CHANGED
@@ -15,7 +15,3 @@ def literal2idiomatic(ver: str) -> Path:
|
|
15 |
|
16 |
def alpha_dir(ver: str) -> Path:
    """
    It builds the path of the "alpha_{ver}" entry under ARTIFACTS_DIR.
    """
    dirname = f"alpha_{ver}"
    return ARTIFACTS_DIR / dirname
|
18 |
-
|
19 |
-
|
20 |
-
def gamma_dir(ver: str) -> Path:
    """
    It builds the path of the "beta_{ver}" entry under ARTIFACTS_DIR.
    """
    # NOTE(review): the prefix is "beta_" although the function is named
    # gamma_dir - confirm which of the two is intended.
    dirname = f"beta_{ver}"
    return ARTIFACTS_DIR / dirname
|
|
|
15 |
|
16 |
def alpha_dir(ver: str) -> Path:
    """
    It builds the path of the "alpha_{ver}" entry under ARTIFACTS_DIR.
    """
    dirname = f"alpha_{ver}"
    return ARTIFACTS_DIR / dirname
|
|
|
|
|
|
|
|