diff --git "a/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/part2.ipynb" "b/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/part2.ipynb" --- "a/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/part2.ipynb" +++ "b/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/part2.ipynb" @@ -1,27 +1,15 @@ { "cells": [ - { - "cell_type": "markdown", - "id": "8f9ba648", - "metadata": { - "tags": [ - "papermill-error-cell-tag" - ] - }, - "source": [ - "An Exception was encountered at 'In [9]'." - ] - }, { "attachments": {}, "cell_type": "markdown", - "id": "0fa80441", + "id": "3174f701", "metadata": { "papermill": { - "duration": 0.005046, - "end_time": "2023-09-06T16:21:38.086651", + "duration": 0.005065, + "end_time": "2023-09-06T17:12:47.606560", "exception": false, - "start_time": "2023-09-06T16:21:38.081605", + "start_time": "2023-09-06T17:12:47.601495", "status": "completed" }, "tags": [] @@ -37,13 +25,13 @@ { "attachments": {}, "cell_type": "markdown", - "id": "bbfc0a03", + "id": "cb5debdd", "metadata": { "papermill": { - "duration": 0.002135, - "end_time": "2023-09-06T16:21:38.092947", + "duration": 0.002115, + "end_time": "2023-09-06T17:12:47.612606", "exception": false, - "start_time": "2023-09-06T16:21:38.090812", + "start_time": "2023-09-06T17:12:47.610491", "status": "completed" }, "tags": [] @@ -55,19 +43,19 @@ { "cell_type": "code", "execution_count": 1, - "id": "1fe0f82c", + "id": "41bbf98d", "metadata": { "execution": { - "iopub.execute_input": "2023-09-06T16:21:38.098818Z", - "iopub.status.busy": "2023-09-06T16:21:38.098531Z", - "iopub.status.idle": "2023-09-06T16:21:38.979393Z", - "shell.execute_reply": "2023-09-06T16:21:38.978561Z" + "iopub.execute_input": "2023-09-06T17:12:47.618377Z", + "iopub.status.busy": "2023-09-06T17:12:47.618157Z", + "iopub.status.idle": "2023-09-06T17:12:48.494513Z", + "shell.execute_reply": "2023-09-06T17:12:48.493600Z" }, "papermill": { - "duration": 0.886067, - "end_time": "2023-09-06T16:21:38.981283", + "duration": 0.881639, + "end_time": "2023-09-06T17:12:48.496472", "exception": false, - "start_time": "2023-09-06T16:21:38.095216", + "start_time": "2023-09-06T17:12:47.614833", "status": "completed" }, "tags": [] @@ -77,10 +65,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "CITATION.cff RWKV-v4wavenet\t RWKV-v5headsize32 checkpoint\tnotebook\r\n", - "LICENSE RWKV-v5\t\t RWKV-v5r2\t datapath\toutput\r\n", - "README.md RWKV-v5altwavenet RWKV-v5rstack\t docker\r\n", - "RWKV-v4neo RWKV-v5headsize2x RWKV-v5wavenet model\r\n" + "CITATION.cff RWKV-v4wavenet\t RWKV-v5headsize2x checkpoint\tnotebook\r\n", + "LICENSE RWKV-v5\t\t RWKV-v5headsize32 datapath\toutput\r\n", + "README.md RWKV-v5-beta2\t RWKV-v5rstack\t docker\r\n", + "RWKV-v4neo RWKV-v5altwavenet RWKV-v5wavenet model\r\n" ] } ], @@ -95,19 +83,19 @@ { "cell_type": "code", "execution_count": 2, - "id": "333141ec", + "id": "bc308e46", "metadata": { "execution": { - "iopub.execute_input": "2023-09-06T16:21:38.991928Z", - "iopub.status.busy": "2023-09-06T16:21:38.991392Z", - "iopub.status.idle": "2023-09-06T16:21:41.124533Z", - "shell.execute_reply": "2023-09-06T16:21:41.123643Z" + "iopub.execute_input": "2023-09-06T17:12:48.506904Z", + "iopub.status.busy": "2023-09-06T17:12:48.506660Z", + "iopub.status.idle": "2023-09-06T17:12:50.610312Z", + "shell.execute_reply": "2023-09-06T17:12:50.609442Z" }, "papermill": { - "duration": 2.140539, - "end_time": "2023-09-06T16:21:41.126606", + "duration": 2.110898, + "end_time": "2023-09-06T17:12:50.612132", "exception": false, - "start_time": "2023-09-06T16:21:38.986067", + "start_time": "2023-09-06T17:12:48.501234", "status": "completed" }, "tags": [] @@ -130,19 +118,19 @@ { "cell_type": "code", "execution_count": 3, - "id": "5bae6c49", + "id": "5ecce62b", "metadata": { "execution": { - "iopub.execute_input": "2023-09-06T16:21:41.136483Z", - "iopub.status.busy": "2023-09-06T16:21:41.136334Z", - "iopub.status.idle": "2023-09-06T16:21:41.142021Z", - "shell.execute_reply": "2023-09-06T16:21:41.141451Z" + "iopub.execute_input": "2023-09-06T17:12:50.622765Z", + "iopub.status.busy": "2023-09-06T17:12:50.622510Z", + "iopub.status.idle": "2023-09-06T17:12:50.631551Z", + "shell.execute_reply": "2023-09-06T17:12:50.630955Z" }, "papermill": { - "duration": 0.011523, - "end_time": "2023-09-06T16:21:41.143031", + "duration": 0.01615, + "end_time": "2023-09-06T17:12:50.633066", "exception": false, - "start_time": "2023-09-06T16:21:41.131508", + "start_time": "2023-09-06T17:12:50.616916", "status": "completed" }, "tags": [] @@ -209,19 +197,19 @@ { "cell_type": "code", "execution_count": 4, - "id": "c0de3166", + "id": "ecee273d", "metadata": { "execution": { - "iopub.execute_input": "2023-09-06T16:21:41.151365Z", - "iopub.status.busy": "2023-09-06T16:21:41.151176Z", - "iopub.status.idle": "2023-09-06T16:27:17.337465Z", - "shell.execute_reply": "2023-09-06T16:27:17.336590Z" + "iopub.execute_input": "2023-09-06T17:12:50.643057Z", + "iopub.status.busy": "2023-09-06T17:12:50.642933Z", + "iopub.status.idle": "2023-09-06T17:13:06.898900Z", + "shell.execute_reply": "2023-09-06T17:13:06.898162Z" }, "papermill": { - "duration": 336.192521, - "end_time": "2023-09-06T16:27:17.339221", + "duration": 16.262552, + "end_time": "2023-09-06T17:13:06.900660", "exception": false, - "start_time": "2023-09-06T16:21:41.146700", + "start_time": "2023-09-06T17:12:50.638108", "status": "completed" }, "tags": [] @@ -231,9 +219,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "--2023-09-06 16:21:41-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/v5r3-L12-D2048-E0_1-enwiki-4k.pth\r\n", - "Resolving huggingface.co (huggingface.co)... 13.33.33.102, 13.33.33.110, 13.33.33.20, ...\r\n", - "Connecting to huggingface.co (huggingface.co)|13.33.33.102|:443... connected.\r\n", + "--2023-09-06 17:12:50-- https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/v5r3-L12-D2048-E0_1-enwiki-4k.pth\r\n", + "Resolving huggingface.co (huggingface.co)... 13.33.33.55, 13.33.33.102, 13.33.33.110, ...\r\n", + "Connecting to huggingface.co (huggingface.co)|13.33.33.55|:443... connected.\r\n", "HTTP request sent, awaiting response... " ] }, @@ -242,8 +230,8 @@ "output_type": "stream", "text": [ "302 Found\r\n", - "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/fcd2c54e435c74dc2a43bd3bbde6594de9c6937156caf9f72a77137ed3d49539?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L12-D2048-E0_1-enwiki-4k.pth%3B+filename%3D%22v5r3-L12-D2048-E0_1-enwiki-4k.pth%22%3B&Expires=1694276501&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NDI3NjUwMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkL2ZjZDJjNTRlNDM1Yzc0ZGMyYTQzYmQzYmJkZTY1OTRkZTljNjkzNzE1NmNhZjlmNzJhNzcxMzdlZDNkNDk1Mzk%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=CI61yjWgEWnbmEXPc-wPNVHfdsEO2SQfSDn7vOjQ7Hh%7EdsoGq591AfvsbeKBNfhOGa5jqdGrS-L83TUjfWEygfteOhNTryi-noWjp6Su6U0Cy0NV742DJDSjTPuCEpiIg1kkKP%7EI5qaveJTJrUQQLahYV20Djlh%7EkLRgNGLplnHLyIJUC5hRX8eyxEdJQnt8nC95k7cKLKz-yWIXtsdoHj%7EZcVToiGG3IFw%7EdNAjm1ns8WTbdW2v8si4lEZd9WCGf0Brx77zxWrFP1Ot9U5fk4AbQzBnmLE-xOLe0w4%7EhSY%7EburifbyLbNO-f-GEY6UFaFni8ILeziJIkPOJPb8mJA__&Key-Pair-Id=KVTP0A1DKRTAX [following]\r\n", - "--2023-09-06 16:21:41-- https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/fcd2c54e435c74dc2a43bd3bbde6594de9c6937156caf9f72a77137ed3d49539?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L12-D2048-E0_1-enwiki-4k.pth%3B+filename%3D%22v5r3-L12-D2048-E0_1-enwiki-4k.pth%22%3B&Expires=1694276501&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NDI3NjUwMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkL2ZjZDJjNTRlNDM1Yzc0ZGMyYTQzYmQzYmJkZTY1OTRkZTljNjkzNzE1NmNhZjlmNzJhNzcxMzdlZDNkNDk1Mzk%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=CI61yjWgEWnbmEXPc-wPNVHfdsEO2SQfSDn7vOjQ7Hh%7EdsoGq591AfvsbeKBNfhOGa5jqdGrS-L83TUjfWEygfteOhNTryi-noWjp6Su6U0Cy0NV742DJDSjTPuCEpiIg1kkKP%7EI5qaveJTJrUQQLahYV20Djlh%7EkLRgNGLplnHLyIJUC5hRX8eyxEdJQnt8nC95k7cKLKz-yWIXtsdoHj%7EZcVToiGG3IFw%7EdNAjm1ns8WTbdW2v8si4lEZd9WCGf0Brx77zxWrFP1Ot9U5fk4AbQzBnmLE-xOLe0w4%7EhSY%7EburifbyLbNO-f-GEY6UFaFni8ILeziJIkPOJPb8mJA__&Key-Pair-Id=KVTP0A1DKRTAX\r\n", + "Location: https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/fcd2c54e435c74dc2a43bd3bbde6594de9c6937156caf9f72a77137ed3d49539?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L12-D2048-E0_1-enwiki-4k.pth%3B+filename%3D%22v5r3-L12-D2048-E0_1-enwiki-4k.pth%22%3B&Expires=1694279570&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NDI3OTU3MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkL2ZjZDJjNTRlNDM1Yzc0ZGMyYTQzYmQzYmJkZTY1OTRkZTljNjkzNzE1NmNhZjlmNzJhNzcxMzdlZDNkNDk1Mzk%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=vlJUg9zPT-Ie2MebXI3t7Sfovkvk30xvNya0WqXvAogwISiGWpmGNd3IKa0rDNdEdrQ3uREbJSFhcam12E5VepvwzlhCsUFsI4W9YnOQ8JOVAtNH5fzk16zGizK7%7EtmvJszRMbwukNZOp6TGz4kqEQPgwAwv26tPs9mP2ATP59hiH30jVnK1yjYot7Y2UAC6vKBdF3%7E%7EZUsL-ZfcYL0lTLE7xPmtgafMs3DM-TJhA1wPXw2r-ByBDo2l6edDKcosW36ncjch5kT5XXrnmxEhX4Yll0kAYuwvfXZI2AsIfeopfeKyYhg0KKeAwrPaxHzAcfQSHQn%7EVIjtW-Ro-8XAUw__&Key-Pair-Id=KVTP0A1DKRTAX [following]\r\n", + "--2023-09-06 17:12:51-- https://cdn-lfs.huggingface.co/repos/2e/f7/2ef78555202aa92abdbdf476ce3d0fd5a8b15f7245edf0b80d4d30572355f30d/fcd2c54e435c74dc2a43bd3bbde6594de9c6937156caf9f72a77137ed3d49539?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27v5r3-L12-D2048-E0_1-enwiki-4k.pth%3B+filename%3D%22v5r3-L12-D2048-E0_1-enwiki-4k.pth%22%3B&Expires=1694279570&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5NDI3OTU3MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yZS9mNy8yZWY3ODU1NTIwMmFhOTJhYmRiZGY0NzZjZTNkMGZkNWE4YjE1ZjcyNDVlZGYwYjgwZDRkMzA1NzIzNTVmMzBkL2ZjZDJjNTRlNDM1Yzc0ZGMyYTQzYmQzYmJkZTY1OTRkZTljNjkzNzE1NmNhZjlmNzJhNzcxMzdlZDNkNDk1Mzk%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=vlJUg9zPT-Ie2MebXI3t7Sfovkvk30xvNya0WqXvAogwISiGWpmGNd3IKa0rDNdEdrQ3uREbJSFhcam12E5VepvwzlhCsUFsI4W9YnOQ8JOVAtNH5fzk16zGizK7%7EtmvJszRMbwukNZOp6TGz4kqEQPgwAwv26tPs9mP2ATP59hiH30jVnK1yjYot7Y2UAC6vKBdF3%7E%7EZUsL-ZfcYL0lTLE7xPmtgafMs3DM-TJhA1wPXw2r-ByBDo2l6edDKcosW36ncjch5kT5XXrnmxEhX4Yll0kAYuwvfXZI2AsIfeopfeKyYhg0KKeAwrPaxHzAcfQSHQn%7EVIjtW-Ro-8XAUw__&Key-Pair-Id=KVTP0A1DKRTAX\r\n", "Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... " ] }, @@ -251,21 +239,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "18.155.68.128, 18.155.68.94, 18.155.68.73, ...\r\n", - "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.155.68.128|:443... connected.\r\n", - "HTTP request sent, awaiting response... " + "18.155.68.98, 18.155.68.128, 18.155.68.94, ...\r\n", + "Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|18.155.68.98|:443... connected.\r\n", + "HTTP request sent, awaiting response... 200 OK\r\n", + "Length: 1721187013 (1.6G) [binary/octet-stream]\r\n", + "Saving to: ‘v5r3-L12-D2048-E0_1-enwiki-4k.pth’\r\n", + "\r\n", + "\r", + " v5r3-L12- 0%[ ] 0 --.-KB/s " ] }, { "name": "stdout", "output_type": "stream", "text": [ - "200 OK\r\n", - "Length: 1721187013 (1.6G) [binary/octet-stream]\r\n", - "Saving to: ‘v5r3-L12-D2048-E0_1-enwiki-4k.pth’\r\n", - "\r\n", "\r", - " v5r3-L12- 0%[ ] 0 --.-KB/s " + " v5r3-L12-D 1%[ ] 21.42M 107MB/s " ] }, { @@ -273,7 +262,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 0%[ ] 24.35K 99.2KB/s " + " v5r3-L12-D2 2%[ ] 43.83M 110MB/s " ] }, { @@ -281,7 +270,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 0%[ ] 58.35K 119KB/s " + " v5r3-L12-D20 4%[ ] 66.17M 110MB/s " ] }, { @@ -289,7 +278,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 0%[ ] 147.29K 200KB/s " + " v5r3-L12-D204 5%[> ] 88.57M 111MB/s " ] }, { @@ -297,7 +286,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 0%[ ] 314.86K 320KB/s " + " v5r3-L12-D2048 6%[> ] 110.92M 111MB/s " ] }, { @@ -305,7 +294,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 0%[ ] 517.01K 421KB/s " + " v5r3-L12-D2048- 8%[> ] 133.36M 111MB/s " ] }, { @@ -313,7 +302,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 0%[ ] 727.37K 493KB/s " + " v5r3-L12-D2048-E 9%[> ] 155.76M 111MB/s " ] }, { @@ -321,7 +310,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 0%[ ] 935.73K 544KB/s " + " v5r3-L12-D2048-E0 10%[=> ] 178.16M 111MB/s " ] }, { @@ -329,7 +318,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 0%[ ] 1.13M 589KB/s " + " v5r3-L12-D2048-E0_ 12%[=> ] 200.56M 111MB/s " ] }, { @@ -337,7 +326,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 0%[ ] 1.37M 633KB/s " + "v5r3-L12-D2048-E0_1 13%[=> ] 222.97M 111MB/s " ] }, { @@ -345,7 +334,7 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 0%[ ] 1.62M 673KB/s " + "5r3-L12-D2048-E0_1- 14%[=> ] 244.89M 111MB/s " ] }, { @@ -353,7 +342,7 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 0%[ ] 1.87M 716KB/s " + "r3-L12-D2048-E0_1-e 16%[==> ] 267.32M 111MB/s " ] }, { @@ -361,7 +350,7 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 0%[ ] 2.15M 744KB/s " + "3-L12-D2048-E0_1-en 17%[==> ] 289.71M 111MB/s " ] }, { @@ -369,7 +358,7 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 0%[ ] 2.43M 778KB/s eta 35m 56s" + "-L12-D2048-E0_1-enw 19%[==> ] 312.11M 111MB/s " ] }, { @@ -377,7 +366,7 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 0%[ ] 2.73M 812KB/s eta 35m 56s" + "L12-D2048-E0_1-enwi 20%[===> ] 334.51M 111MB/s eta 12s " ] }, { @@ -385,7 +374,7 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 0%[ ] 3.04M 845KB/s eta 35m 56s" + "12-D2048-E0_1-enwik 21%[===> ] 356.91M 112MB/s eta 12s " ] }, { @@ -393,7 +382,7 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 0%[ ] 3.38M 878KB/s eta 35m 56s" + "2-D2048-E0_1-enwiki 23%[===> ] 379.31M 112MB/s eta 12s " ] }, { @@ -401,7 +390,7 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 0%[ ] 3.67M 850KB/s eta 32m 53s" + "-D2048-E0_1-enwiki- 24%[===> ] 401.72M 112MB/s eta 12s " ] }, { @@ -409,7 +398,7 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 0%[ ] 4.29M 937KB/s eta 32m 53s" + "D2048-E0_1-enwiki-4 25%[====> ] 424.13M 112MB/s eta 12s " ] }, { @@ -417,7 +406,7 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 0%[ ] 4.55M 948KB/s eta 32m 53s" + "2048-E0_1-enwiki-4k 27%[====> ] 446.53M 112MB/s eta 11s " ] }, { @@ -425,7 +414,7 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 0%[ ] 4.84M 959KB/s eta 32m 53s" + "048-E0_1-enwiki-4k. 28%[====> ] 468.94M 112MB/s eta 11s " ] }, { @@ -433,7 +422,7 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 0%[ ] 5.13M 1012KB/s eta 32m 53s" + "48-E0_1-enwiki-4k.p 29%[====> ] 491.34M 112MB/s eta 11s " ] }, { @@ -441,7 +430,7 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 0%[ ] 5.42M 1.04MB/s eta 28m 28s" + "8-E0_1-enwiki-4k.pt 31%[=====> ] 513.75M 112MB/s eta 11s " ] }, { @@ -449,7 +438,7 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 0%[ ] 5.73M 1.08MB/s eta 28m 28s" + "-E0_1-enwiki-4k.pth 32%[=====> ] 536.15M 112MB/s eta 11s " ] }, { @@ -457,7 +446,7 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 0%[ ] 6.05M 1.11MB/s eta 28m 28s" + "E0_1-enwiki-4k.pth 34%[=====> ] 558.51M 112MB/s eta 10s " ] }, { @@ -465,7 +454,7 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 0%[ ] 6.36M 1.13MB/s eta 28m 28s" + "0_1-enwiki-4k.pth 35%[======> ] 580.91M 112MB/s eta 10s " ] }, { @@ -473,7 +462,7 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 0%[ ] 6.70M 1.16MB/s eta 28m 28s" + "_1-enwiki-4k.pth 36%[======> ] 603.30M 112MB/s eta 10s " ] }, { @@ -481,7 +470,7 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 0%[ ] 7.03M 1.18MB/s eta 26m 41s" + "1-enwiki-4k.pth 38%[======> ] 625.71M 112MB/s eta 10s " ] }, { @@ -489,7 +478,7 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 0%[ ] 7.38M 1.21MB/s eta 26m 41s" + "-enwiki-4k.pth 39%[======> ] 648.10M 112MB/s eta 10s " ] }, { @@ -497,7 +486,7 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 0%[ ] 7.73M 1.23MB/s eta 26m 41s" + "enwiki-4k.pth 40%[=======> ] 670.50M 112MB/s eta 9s " ] }, { @@ -505,7 +494,7 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 0%[ ] 8.07M 1.25MB/s eta 26m 41s" + "nwiki-4k.pth 42%[=======> ] 692.78M 112MB/s eta 9s " ] }, { @@ -513,7 +502,7 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 0%[ ] 8.42M 1.26MB/s eta 26m 41s" + "wiki-4k.pth 43%[=======> ] 715.16M 112MB/s eta 9s " ] }, { @@ -521,7 +510,7 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 0%[ ] 8.77M 1.29MB/s eta 25m 10s" + "iki-4k.pth 44%[=======> ] 737.57M 112MB/s eta 9s " ] }, { @@ -529,7 +518,7 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 0%[ ] 9.13M 1.31MB/s eta 25m 10s" + "ki-4k.pth 46%[========> ] 759.99M 112MB/s eta 9s " ] }, { @@ -537,7 +526,7 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 0%[ ] 9.49M 1.31MB/s eta 25m 10s" + "i-4k.pth 47%[========> ] 782.39M 112MB/s eta 8s " ] }, { @@ -545,7 +534,7 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 0%[ ] 9.85M 1.40MB/s eta 25m 10s" + "-4k.pth 49%[========> ] 804.80M 112MB/s eta 8s " ] }, { @@ -553,7 +542,7 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 0%[ ] 10.20M 1.34MB/s eta 25m 10s" + "4k.pth 50%[=========> ] 827.18M 112MB/s eta 8s " ] }, { @@ -561,7 +550,7 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 0%[ ] 10.56M 1.36MB/s eta 24m 2s " + "k.pth 51%[=========> ] 849.60M 112MB/s eta 8s " ] }, { @@ -569,7 +558,7 @@ "output_type": "stream", "text": [ "\r", - "k.pth 0%[ ] 10.93M 1.38MB/s eta 24m 2s " + ".pth 53%[=========> ] 872.00M 112MB/s eta 8s " ] }, { @@ -577,7 +566,7 @@ "output_type": "stream", "text": [ "\r", - ".pth 0%[ ] 11.29M 1.37MB/s eta 24m 2s " + "pth 54%[=========> ] 894.41M 112MB/s eta 7s " ] }, { @@ -585,7 +574,7 @@ "output_type": "stream", "text": [ "\r", - "pth 0%[ ] 11.64M 1.41MB/s eta 24m 2s " + "th 55%[==========> ] 916.82M 112MB/s eta 7s " ] }, { @@ -593,7 +582,7 @@ "output_type": "stream", "text": [ "\r", - "th 0%[ ] 12.01M 1.43MB/s eta 24m 2s " + "h 57%[==========> ] 939.22M 112MB/s eta 7s " ] }, { @@ -601,7 +590,7 @@ "output_type": "stream", "text": [ "\r", - "h 0%[ ] 12.37M 1.44MB/s eta 23m 12s" + " 58%[==========> ] 961.63M 112MB/s eta 7s " ] }, { @@ -609,7 +598,7 @@ "output_type": "stream", "text": [ "\r", - " 0%[ ] 12.74M 1.45MB/s eta 23m 12s" + " v 59%[==========> ] 984.03M 112MB/s eta 7s " ] }, { @@ -617,7 +606,7 @@ "output_type": "stream", "text": [ "\r", - " v 0%[ ] 13.10M 1.45MB/s eta 23m 12s" + " v5 61%[===========> ] 1006M 112MB/s eta 6s " ] }, { @@ -625,7 +614,7 @@ "output_type": "stream", "text": [ "\r", - " v5 0%[ ] 13.45M 1.43MB/s eta 23m 12s" + " v5r 62%[===========> ] 1.00G 112MB/s eta 6s " ] }, { @@ -633,7 +622,7 @@ "output_type": "stream", "text": [ "\r", - " v5r 0%[ ] 13.82M 1.43MB/s eta 23m 12s" + " v5r3 64%[===========> ] 1.03G 112MB/s eta 6s " ] }, { @@ -641,7 +630,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3 0%[ ] 14.18M 1.42MB/s eta 22m 34s" + " v5r3- 65%[============> ] 1.05G 112MB/s eta 6s " ] }, { @@ -649,7 +638,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 0%[ ] 14.55M 1.47MB/s eta 22m 34s" + " v5r3-L 66%[============> ] 1.07G 112MB/s eta 6s " ] }, { @@ -657,7 +646,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 0%[ ] 14.91M 1.47MB/s eta 22m 34s" + " v5r3-L1 68%[============> ] 1.09G 112MB/s eta 5s " ] }, { @@ -665,7 +654,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 0%[ ] 15.28M 1.43MB/s eta 22m 34s" + " v5r3-L12 68%[============> ] 1.09G 105MB/s eta 5s " ] }, { @@ -673,7 +662,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 0%[ ] 15.64M 1.48MB/s eta 22m 34s" + " v5r3-L12- 69%[============> ] 1.11G 103MB/s eta 5s " ] }, { @@ -681,7 +670,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 0%[ ] 16.01M 1.47MB/s eta 22m 3s " + " v5r3-L12-D 70%[=============> ] 1.13G 101MB/s eta 5s " ] }, { @@ -689,7 +678,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 0%[ ] 16.39M 1.48MB/s eta 22m 3s " + " v5r3-L12-D2 71%[=============> ] 1.14G 98.7MB/s eta 5s " ] }, { @@ -697,7 +686,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 1%[ ] 16.77M 1.48MB/s eta 22m 3s " + " v5r3-L12-D20 72%[=============> ] 1.16G 96.7MB/s eta 4s " ] }, { @@ -705,7 +694,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 1%[ ] 17.12M 1.47MB/s eta 22m 3s " + " v5r3-L12-D204 73%[=============> ] 1.17G 93.8MB/s eta 4s " ] }, { @@ -713,7 +702,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 1%[ ] 17.50M 1.48MB/s eta 22m 3s " + " v5r3-L12-D2048 74%[=============> ] 1.19G 93.4MB/s eta 4s " ] }, { @@ -721,7 +710,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 1%[ ] 17.89M 1.53MB/s eta 21m 27s" + " v5r3-L12-D2048- 75%[==============> ] 1.21G 91.2MB/s eta 4s " ] }, { @@ -729,7 +718,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 1%[ ] 18.28M 1.53MB/s eta 21m 27s" + " v5r3-L12-D2048-E 76%[==============> ] 1.22G 89.5MB/s eta 4s " ] }, { @@ -737,7 +726,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 1%[ ] 18.68M 1.56MB/s eta 21m 27s" + " v5r3-L12-D2048-E0 77%[==============> ] 1.24G 89.3MB/s eta 3s " ] }, { @@ -745,7 +734,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 1%[ ] 19.10M 1.41MB/s eta 21m 31s" + " v5r3-L12-D2048-E0_ 78%[==============> ] 1.26G 85.2MB/s eta 3s " ] }, { @@ -753,7 +742,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 1%[ ] 19.81M 1.55MB/s eta 21m 31s" + "v5r3-L12-D2048-E0_1 79%[==============> ] 1.28G 85.1MB/s eta 3s " ] }, { @@ -761,7 +750,7 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 1%[ ] 20.13M 1.54MB/s eta 21m 31s" + "5r3-L12-D2048-E0_1- 81%[===============> ] 1.30G 85.5MB/s eta 3s " ] }, { @@ -769,7 +758,7 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 1%[ ] 20.44M 1.53MB/s eta 21m 31s" + "r3-L12-D2048-E0_1-e 82%[===============> ] 1.32G 84.6MB/s eta 3s " ] }, { @@ -777,7 +766,7 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 1%[ ] 20.77M 1.52MB/s eta 21m 31s" + "3-L12-D2048-E0_1-en 83%[===============> ] 1.34G 85.0MB/s eta 3s " ] }, { @@ -785,7 +774,7 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 1%[ ] 21.12M 1.49MB/s eta 20m 57s" + "-L12-D2048-E0_1-enw 85%[================> ] 1.37G 86.9MB/s eta 3s " ] }, { @@ -793,7 +782,7 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 1%[ ] 21.47M 1.47MB/s eta 20m 57s" + "L12-D2048-E0_1-enwi 86%[================> ] 1.39G 92.5MB/s eta 3s " ] }, { @@ -801,7 +790,7 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 1%[ ] 21.84M 1.51MB/s eta 20m 57s" + "12-D2048-E0_1-enwik 87%[================> ] 1.41G 94.5MB/s eta 3s " ] }, { @@ -809,7 +798,7 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 1%[ ] 22.22M 1.51MB/s eta 20m 57s" + "2-D2048-E0_1-enwiki 89%[================> ] 1.43G 96.2MB/s eta 3s " ] }, { @@ -817,7 +806,7 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 1%[ ] 22.60M 1.49MB/s eta 20m 57s" + "-D2048-E0_1-enwiki- 90%[=================> ] 1.45G 99.5MB/s eta 1s " ] }, { @@ -825,7 +814,7 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 1%[ ] 22.98M 1.48MB/s eta 20m 41s" + "D2048-E0_1-enwiki-4 92%[=================> ] 1.47G 101MB/s eta 1s " ] }, { @@ -833,7 +822,7 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 1%[ ] 23.38M 1.53MB/s eta 20m 41s" + "2048-E0_1-enwiki-4k 93%[=================> ] 1.50G 104MB/s eta 1s " ] }, { @@ -841,7 +830,7 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 1%[ ] 23.78M 1.53MB/s eta 20m 41s" + "048-E0_1-enwiki-4k. 94%[=================> ] 1.52G 105MB/s eta 1s " ] }, { @@ -849,7 +838,7 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 1%[ ] 24.18M 1.51MB/s eta 20m 41s" + "48-E0_1-enwiki-4k.p 96%[==================> ] 1.54G 107MB/s eta 1s " ] }, { @@ -857,7 +846,7 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 1%[ ] 24.58M 1.53MB/s eta 20m 41s" + "8-E0_1-enwiki-4k.pt 97%[==================> ] 1.56G 107MB/s eta 0s " ] }, { @@ -865,7 +854,7 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 1%[ ] 24.99M 1.52MB/s eta 20m 19s" + "-E0_1-enwiki-4k.pth 98%[==================> ] 1.58G 111MB/s eta 0s " ] }, { @@ -873,23 +862,77 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 1%[ ] 25.41M 1.53MB/s eta 20m 19s" + "v5r3-L12-D2048-E0_1 100%[===================>] 1.60G 112MB/s in 15s \r\n", + "\r\n", + "2023-09-06 17:13:06 (106 MB/s) - ‘v5r3-L12-D2048-E0_1-enwiki-4k.pth’ saved [1721187013/1721187013]\r\n", + "\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\r", - "E0_1-enwiki-4k.pth 1%[ ] 25.82M 1.53MB/s eta 20m 19s" + "total 1.5G\r\n", + "drwxr-xr-x 2 root root 3 Sep 6 17:12 .\r\n", + "drwxr-xr-x 20 root root 24 Sep 6 17:12 ..\r\n", + "-rw-r--r-- 1 root root 1.7G Sep 6 15:04 v5r3-L12-D2048-E0_1-enwiki-4k.pth\r\n" ] + } + ], + "source": [ + "# Download the model directly (stop gap till HF sync issues is resolved)\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/{DIR_NAME}/{FILENAME_PREFIX}-enwiki-4k.pth\"\n", + "\n", + "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", + " ls -alh ." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "326602ab", + "metadata": { + "papermill": { + "duration": 0.005225, + "end_time": "2023-09-06T17:13:06.914108", + "exception": false, + "start_time": "2023-09-06T17:13:06.908883", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Enwiki Stage 2 : Basic Instruct Tuning" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e3aa35e9", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:13:06.925901Z", + "iopub.status.busy": "2023-09-06T17:13:06.925655Z", + "iopub.status.idle": "2023-09-06T17:13:14.291842Z", + "shell.execute_reply": "2023-09-06T17:13:14.291053Z" + }, + "papermill": { + "duration": 7.374402, + "end_time": "2023-09-06T17:13:14.293884", + "exception": false, + "start_time": "2023-09-06T17:13:06.919482", + "status": "completed" }, + "tags": [] + }, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 1%[ ] 26.24M 1.52MB/s eta 20m 19s" + "Saving the dataset (0/1 shards): 0%| | 0/14932 [00:00=12.1), as this is known to have freeze issues\r\n", + "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", + "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", + "#\r\n", + "\r\n", + "[RWKV.model] Configuring optimizer with\r\n", + " - lr_init: 4.000e-04 (0.0004)\r\n", + " - lr_final: 3.000e-04 (0.0003)\r\n", + "\r\n", + "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Detected CUDA files, patching ldflags\r\n", + "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...\r\n", + "Building extension module fused_adam...\r\n", + "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n", + "ninja: no work to do.\r\n", + "Loading extension module fused_adam...\r\n", + "Time to load fused_adam op: 0.06091904640197754 seconds\r\n", + "Loading `train_dataloader` to estimate number of stepping batches.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rank: 0 partition count [1, 1] and sizes[(860549120, False), (768, False)] \r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r\n", + " | Name | Type | Params\r\n", + "--------------------------------------\r\n", + "0 | emb | Embedding | 102 M \r\n", + "1 | blocks | ModuleList | 654 M \r\n", + "2 | ln_out | LayerNorm | 4.1 K \r\n", + "3 | head | Linear | 102 M \r\n", + "--------------------------------------\r\n", + "860 M Trainable params\r\n", + "0 Non-trainable params\r\n", + "860 M Total params\r\n", + "3,442.200 Total estimated model params size (MB)\r\n" ] }, { @@ -985,7 +1149,9 @@ "output_type": "stream", "text": [ "\r", - "k.pth 1%[ ] 31.27M 1.69MB/s eta 19m 24s" + "Training: 0it [00:00, ?it/s]\r", + "Training: 0%| | 0/14932 [00:00 ] 82.33M 2.52MB/s eta 15m 3s " + "Epoch 0: 1%| | 103/14932 [00:20<49:58, 4.94it/s, v_num=o30c, train/loss=3.620\r", + "Epoch 0: 1%| | 103/14932 [00:20<49:58, 4.94it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -1849,7 +2114,8 @@ "output_type": "stream", "text": [ "\r", - "h 5%[> ] 82.94M 2.47MB/s eta 14m 51s" + "Epoch 0: 1%| | 104/14932 [00:20<49:40, 4.97it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 1%| | 104/14932 [00:20<49:40, 4.97it/s, v_num=o30c, train/loss=3.160" ] }, { @@ -1857,7 +2123,8 @@ "output_type": "stream", "text": [ "\r", - " 5%[> ] 83.54M 2.57MB/s eta 14m 51s" + "Epoch 0: 1%| | 105/14932 [00:20<49:22, 5.01it/s, v_num=o30c, train/loss=3.160\r", + "Epoch 0: 1%| | 105/14932 [00:20<49:22, 5.01it/s, v_num=o30c, train/loss=3.610" ] }, { @@ -1865,7 +2132,8 @@ "output_type": "stream", "text": [ "\r", - " v 5%[> ] 84.14M 2.50MB/s eta 14m 51s" + "Epoch 0: 1%| | 106/14932 [00:21<49:04, 5.04it/s, v_num=o30c, train/loss=3.610\r", + "Epoch 0: 1%| | 106/14932 [00:21<49:04, 5.04it/s, v_num=o30c, train/loss=2.500" ] }, { @@ -1873,7 +2141,8 @@ "output_type": "stream", "text": [ "\r", - " v5 5%[> ] 84.75M 2.54MB/s eta 14m 51s" + "Epoch 0: 1%| | 107/14932 [00:21<49:58, 4.94it/s, v_num=o30c, train/loss=2.500\r", + "Epoch 0: 1%| | 107/14932 [00:21<49:58, 4.94it/s, v_num=o30c, train/loss=4.000" ] }, { @@ -1881,7 +2150,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 5%[> ] 85.36M 2.50MB/s eta 14m 51s" + "Epoch 0: 1%| | 108/14932 [00:21<49:56, 4.95it/s, v_num=o30c, train/loss=4.000\r", + "Epoch 0: 1%| | 108/14932 [00:21<49:56, 4.95it/s, v_num=o30c, train/loss=3.830" ] }, { @@ -1889,7 +2159,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 5%[> ] 85.97M 2.54MB/s eta 14m 40s" + "Epoch 0: 1%| | 109/14932 [00:21<49:37, 4.98it/s, v_num=o30c, train/loss=3.830\r", + "Epoch 0: 1%| | 109/14932 [00:21<49:37, 4.98it/s, v_num=o30c, train/loss=4.560" ] }, { @@ -1897,7 +2168,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 5%[> ] 86.58M 2.51MB/s eta 14m 40s" + "Epoch 0: 1%| | 110/14932 [00:21<49:19, 5.01it/s, v_num=o30c, train/loss=4.560\r", + "Epoch 0: 1%| | 110/14932 [00:21<49:19, 5.01it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -1905,7 +2177,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 5%[> ] 87.19M 2.55MB/s eta 14m 40s" + "Epoch 0: 1%| | 111/14932 [00:22<49:02, 5.04it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 1%| | 111/14932 [00:22<49:02, 5.04it/s, v_num=o30c, train/loss=3.190" ] }, { @@ -1913,7 +2186,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 5%[> ] 87.82M 2.51MB/s eta 14m 40s" + "Epoch 0: 1%| | 112/14932 [00:22<48:45, 5.07it/s, v_num=o30c, train/loss=3.190\r", + "Epoch 0: 1%| | 112/14932 [00:22<48:45, 5.07it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -1921,7 +2195,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 5%[> ] 88.43M 2.56MB/s eta 14m 40s" + "Epoch 0: 1%| | 113/14932 [00:22<48:28, 5.10it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 1%| | 113/14932 [00:22<48:28, 5.10it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -1929,7 +2204,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 5%[> ] 89.05M 2.52MB/s eta 14m 29s" + "Epoch 0: 1%| | 114/14932 [00:22<48:11, 5.13it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 1%| | 114/14932 [00:22<48:11, 5.13it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -1937,7 +2213,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 5%[> ] 89.69M 2.57MB/s eta 14m 29s" + "Epoch 0: 1%| | 115/14932 [00:22<47:55, 5.15it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 1%| | 115/14932 [00:22<47:55, 5.15it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -1945,7 +2222,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 5%[> ] 90.32M 2.54MB/s eta 14m 29s" + "Epoch 0: 1%| | 116/14932 [00:22<47:39, 5.18it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 1%| | 116/14932 [00:22<47:39, 5.18it/s, v_num=o30c, train/loss=2.410" ] }, { @@ -1953,7 +2231,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 5%[> ] 90.97M 2.59MB/s eta 14m 29s" + "Epoch 0: 1%| | 117/14932 [00:22<47:24, 5.21it/s, v_num=o30c, train/loss=2.410\r", + "Epoch 0: 1%| | 117/14932 [00:22<47:24, 5.21it/s, v_num=o30c, train/loss=4.220" ] }, { @@ -1961,7 +2240,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 5%[> ] 91.61M 2.56MB/s eta 14m 29s" + "Epoch 0: 1%| | 118/14932 [00:22<47:08, 5.24it/s, v_num=o30c, train/loss=4.220\r", + "Epoch 0: 1%| | 118/14932 [00:22<47:08, 5.24it/s, v_num=o30c, train/loss=6.250" ] }, { @@ -1969,7 +2249,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 5%[> ] 92.27M 2.62MB/s eta 14m 18s" + "Epoch 0: 1%| | 119/14932 [00:22<46:53, 5.27it/s, v_num=o30c, train/loss=6.250\r", + "Epoch 0: 1%| | 119/14932 [00:22<46:53, 5.27it/s, v_num=o30c, train/loss=3.980" ] }, { @@ -1977,7 +2258,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 5%[> ] 92.94M 2.60MB/s eta 14m 18s" + "Epoch 0: 1%| | 120/14932 [00:22<46:37, 5.29it/s, v_num=o30c, train/loss=3.980\r", + "Epoch 0: 1%| | 120/14932 [00:22<46:37, 5.29it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -1985,7 +2267,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 5%[> ] 93.61M 2.66MB/s eta 14m 18s" + "Epoch 0: 1%| | 121/14932 [00:22<46:23, 5.32it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 1%| | 121/14932 [00:22<46:23, 5.32it/s, v_num=o30c, train/loss=3.560" ] }, { @@ -1993,7 +2276,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 5%[> ] 94.30M 2.64MB/s eta 14m 18s" + "Epoch 0: 1%| | 122/14932 [00:22<46:09, 5.35it/s, v_num=o30c, train/loss=3.560\r", + "Epoch 0: 1%| | 122/14932 [00:22<46:09, 5.35it/s, v_num=o30c, train/loss=2.840" ] }, { @@ -2001,7 +2285,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 5%[> ] 94.99M 2.71MB/s eta 14m 18s" + "Epoch 0: 1%| | 123/14932 [00:22<45:56, 5.37it/s, v_num=o30c, train/loss=2.840\r", + "Epoch 0: 1%| | 123/14932 [00:22<45:56, 5.37it/s, v_num=o30c, train/loss=3.810" ] }, { @@ -2009,7 +2294,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 5%[> ] 95.71M 2.70MB/s eta 14m 5s " + "Epoch 0: 1%| | 124/14932 [00:22<45:42, 5.40it/s, v_num=o30c, train/loss=3.810\r", + "Epoch 0: 1%| | 124/14932 [00:22<45:42, 5.40it/s, v_num=o30c, train/loss=3.610" ] }, { @@ -2017,7 +2303,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 5%[> ] 96.43M 2.77MB/s eta 14m 5s " + "Epoch 0: 1%| | 125/14932 [00:23<45:29, 5.43it/s, v_num=o30c, train/loss=3.610\r", + "Epoch 0: 1%| | 125/14932 [00:23<45:29, 5.43it/s, v_num=o30c, train/loss=3.950" ] }, { @@ -2025,7 +2312,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 5%[> ] 97.18M 2.77MB/s eta 14m 5s " + "Epoch 0: 1%| | 126/14932 [00:23<45:16, 5.45it/s, v_num=o30c, train/loss=3.950\r", + "Epoch 0: 1%| | 126/14932 [00:23<45:16, 5.45it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -2033,7 +2321,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 5%[> ] 97.93M 2.85MB/s eta 14m 5s " + "Epoch 0: 1%| | 127/14932 [00:23<45:03, 5.48it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 1%| | 127/14932 [00:23<45:04, 5.48it/s, v_num=o30c, train/loss=3.840" ] }, { @@ -2041,7 +2330,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 6%[> ] 98.71M 2.85MB/s eta 14m 5s " + "Epoch 0: 1%| | 128/14932 [00:23<45:04, 5.47it/s, v_num=o30c, train/loss=3.840\r", + "Epoch 0: 1%| | 128/14932 [00:23<45:04, 5.47it/s, v_num=o30c, train/loss=3.880" ] }, { @@ -2049,7 +2339,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 6%[> ] 99.50M 2.94MB/s eta 13m 50s" + "Epoch 0: 1%| | 129/14932 [00:23<44:52, 5.50it/s, v_num=o30c, train/loss=3.880\r", + "Epoch 0: 1%| | 129/14932 [00:23<44:52, 5.50it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -2057,7 +2348,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 6%[> ] 100.32M 2.95MB/s eta 13m 50s" + "Epoch 0: 1%| | 130/14932 [00:23<44:39, 5.53it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 1%| | 130/14932 [00:23<44:39, 5.52it/s, v_num=o30c, train/loss=1.690" ] }, { @@ -2065,7 +2357,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 6%[> ] 101.16M 3.06MB/s eta 13m 50s" + "Epoch 0: 1%| | 131/14932 [00:23<44:26, 5.55it/s, v_num=o30c, train/loss=1.690\r", + "Epoch 0: 1%| | 131/14932 [00:23<44:27, 5.55it/s, v_num=o30c, train/loss=2.780" ] }, { @@ -2073,7 +2366,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 6%[> ] 102.02M 3.08MB/s eta 13m 50s" + "Epoch 0: 1%| | 132/14932 [00:23<44:14, 5.58it/s, v_num=o30c, train/loss=2.780\r", + "Epoch 0: 1%| | 132/14932 [00:23<44:14, 5.58it/s, v_num=o30c, train/loss=1.970" ] }, { @@ -2081,7 +2375,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 6%[> ] 102.89M 3.20MB/s eta 13m 50s" + "Epoch 0: 1%| | 133/14932 [00:23<44:02, 5.60it/s, v_num=o30c, train/loss=1.970\r", + "Epoch 0: 1%| | 133/14932 [00:23<44:02, 5.60it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -2089,7 +2384,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 6%[> ] 103.79M 3.26MB/s eta 13m 31s" + "Epoch 0: 1%| | 134/14932 [00:23<43:50, 5.63it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 1%| | 134/14932 [00:23<43:50, 5.63it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -2097,7 +2393,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 6%[> ] 104.00M 3.10MB/s eta 13m 31s" + "Epoch 0: 1%| | 135/14932 [00:23<43:38, 5.65it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 1%| | 135/14932 [00:23<43:38, 5.65it/s, v_num=o30c, train/loss=2.750" ] }, { @@ -2105,7 +2402,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 6%[> ] 104.91M 3.28MB/s eta 13m 31s" + "Epoch 0: 1%| | 136/14932 [00:23<43:26, 5.68it/s, v_num=o30c, train/loss=2.750\r", + "Epoch 0: 1%| | 136/14932 [00:23<43:27, 5.68it/s, v_num=o30c, train/loss=5.250" ] }, { @@ -2113,7 +2411,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 6%[> ] 105.80M 3.30MB/s eta 13m 31s" + "Epoch 0: 1%| | 137/14932 [00:24<43:15, 5.70it/s, v_num=o30c, train/loss=5.250\r", + "Epoch 0: 1%| | 137/14932 [00:24<43:15, 5.70it/s, v_num=o30c, train/loss=1.550" ] }, { @@ -2121,7 +2420,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 6%[> ] 106.77M 3.49MB/s eta 13m 31s" + "Epoch 0: 1%| | 138/14932 [00:24<43:05, 5.72it/s, v_num=o30c, train/loss=1.550\r", + "Epoch 0: 1%| | 138/14932 [00:24<43:05, 5.72it/s, v_num=o30c, train/loss=4.500" ] }, { @@ -2129,7 +2429,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 6%[> ] 107.79M 3.57MB/s eta 13m 14s" + "Epoch 0: 1%| | 139/14932 [00:24<42:54, 5.75it/s, v_num=o30c, train/loss=4.500\r", + "Epoch 0: 1%| | 139/14932 [00:24<42:54, 5.75it/s, v_num=o30c, train/loss=4.440" ] }, { @@ -2137,7 +2438,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 6%[> ] 108.85M 3.73MB/s eta 13m 14s" + "Epoch 0: 1%| | 140/14932 [00:24<42:43, 5.77it/s, v_num=o30c, train/loss=4.440\r", + "Epoch 0: 1%| | 140/14932 [00:24<42:43, 5.77it/s, v_num=o30c, train/loss=3.940" ] }, { @@ -2145,7 +2447,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 6%[> ] 109.96M 3.78MB/s eta 13m 14s" + "Epoch 0: 1%| | 141/14932 [00:24<42:32, 5.79it/s, v_num=o30c, train/loss=3.940\r", + "Epoch 0: 1%| | 141/14932 [00:24<42:32, 5.79it/s, v_num=o30c, train/loss=3.560" ] }, { @@ -2153,7 +2456,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 6%[> ] 111.11M 3.96MB/s eta 13m 14s" + "Epoch 0: 1%| | 142/14932 [00:24<42:21, 5.82it/s, v_num=o30c, train/loss=3.560\r", + "Epoch 0: 1%| | 142/14932 [00:24<42:21, 5.82it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -2161,7 +2465,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 6%[> ] 112.30M 4.03MB/s eta 13m 14s" + "Epoch 0: 1%| | 143/14932 [00:24<42:10, 5.84it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 1%| | 143/14932 [00:24<42:11, 5.84it/s, v_num=o30c, train/loss=3.390" ] }, { @@ -2169,7 +2474,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 6%[> ] 113.50M 4.13MB/s eta 12m 47s" + "Epoch 0: 1%| | 144/14932 [00:24<42:00, 5.87it/s, v_num=o30c, train/loss=3.390\r", + "Epoch 0: 1%| | 144/14932 [00:24<42:00, 5.87it/s, v_num=o30c, train/loss=3.770" ] }, { @@ -2177,7 +2483,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 6%[> ] 113.80M 4.11MB/s eta 12m 47s" + "Epoch 0: 1%| | 145/14932 [00:24<41:49, 5.89it/s, v_num=o30c, train/loss=3.770\r", + "Epoch 0: 1%| | 145/14932 [00:24<41:49, 5.89it/s, v_num=o30c, train/loss=3.080" ] }, { @@ -2185,7 +2492,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 7%[> ] 115.05M 4.19MB/s eta 12m 47s" + "Epoch 0: 1%| | 146/14932 [00:24<41:39, 5.91it/s, v_num=o30c, train/loss=3.080\r", + "Epoch 0: 1%| | 146/14932 [00:24<41:39, 5.91it/s, v_num=o30c, train/loss=2.560" ] }, { @@ -2193,7 +2501,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 7%[> ] 116.25M 4.43MB/s eta 12m 47s" + "Epoch 0: 1%| | 147/14932 [00:24<41:29, 5.94it/s, v_num=o30c, train/loss=2.560\r", + "Epoch 0: 1%| | 147/14932 [00:24<41:29, 5.94it/s, v_num=o30c, train/loss=3.030" ] }, { @@ -2201,7 +2510,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 7%[> ] 116.44M 3.99MB/s eta 12m 39s" + "Epoch 0: 1%| | 148/14932 [00:24<41:19, 5.96it/s, v_num=o30c, train/loss=3.030\r", + "Epoch 0: 1%| | 148/14932 [00:24<41:19, 5.96it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -2209,7 +2519,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 7%[> ] 119.21M 4.42MB/s eta 12m 39s" + "Epoch 0: 1%| | 149/14932 [00:24<41:09, 5.99it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 1%| | 149/14932 [00:24<41:09, 5.99it/s, v_num=o30c, train/loss=3.530" ] }, { @@ -2217,7 +2528,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 7%[> ] 120.21M 4.54MB/s eta 12m 39s" + "Epoch 0: 1%| | 150/14932 [00:24<40:59, 6.01it/s, v_num=o30c, train/loss=3.530\r", + "Epoch 0: 1%| | 150/14932 [00:24<40:59, 6.01it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -2225,7 +2537,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 7%[> ] 121.27M 4.48MB/s eta 12m 39s" + "Epoch 0: 1%| | 151/14932 [00:25<40:50, 6.03it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 1%| | 151/14932 [00:25<40:50, 6.03it/s, v_num=o30c, train/loss=4.560" ] }, { @@ -2233,7 +2546,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 7%[> ] 122.36M 4.60MB/s eta 12m 39s" + "Epoch 0: 1%| | 152/14932 [00:25<40:41, 6.05it/s, v_num=o30c, train/loss=4.560\r", + "Epoch 0: 1%| | 152/14932 [00:25<40:41, 6.05it/s, v_num=o30c, train/loss=3.590" ] }, { @@ -2241,7 +2555,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 7%[> ] 123.49M 4.55MB/s eta 12m 8s " + "Epoch 0: 1%| | 153/14932 [00:25<40:31, 6.08it/s, v_num=o30c, train/loss=3.590\r", + "Epoch 0: 1%| | 153/14932 [00:25<40:31, 6.08it/s, v_num=o30c, train/loss=4.000" ] }, { @@ -2249,7 +2564,8 @@ "output_type": "stream", "text": [ "\r", - "pth 7%[> ] 124.64M 4.66MB/s eta 12m 8s " + "Epoch 0: 1%| | 154/14932 [00:25<40:22, 6.10it/s, v_num=o30c, train/loss=4.000\r", + "Epoch 0: 1%| | 154/14932 [00:25<40:22, 6.10it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -2257,7 +2573,8 @@ "output_type": "stream", "text": [ "\r", - "th 7%[> ] 125.82M 4.61MB/s eta 12m 8s " + "Epoch 0: 1%| | 155/14932 [00:25<40:18, 6.11it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 1%| | 155/14932 [00:25<40:18, 6.11it/s, v_num=o30c, train/loss=2.940" ] }, { @@ -2265,7 +2582,8 @@ "output_type": "stream", "text": [ "\r", - "h 7%[> ] 127.02M 4.71MB/s eta 12m 8s " + "Epoch 0: 1%| | 156/14932 [00:25<40:09, 6.13it/s, v_num=o30c, train/loss=2.940\r", + "Epoch 0: 1%| | 156/14932 [00:25<40:09, 6.13it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -2273,7 +2591,8 @@ "output_type": "stream", "text": [ "\r", - " 7%[> ] 128.24M 4.65MB/s eta 12m 8s " + "Epoch 0: 1%| | 157/14932 [00:25<40:00, 6.15it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 1%| | 157/14932 [00:25<40:00, 6.15it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -2281,7 +2600,8 @@ "output_type": "stream", "text": [ "\r", - " v 7%[> ] 129.47M 4.75MB/s eta 11m 46s" + "Epoch 0: 1%| | 158/14932 [00:25<39:51, 6.18it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 1%| | 158/14932 [00:25<39:52, 6.18it/s, v_num=o30c, train/loss=5.220" ] }, { @@ -2289,7 +2609,8 @@ "output_type": "stream", "text": [ "\r", - " v5 7%[> ] 130.74M 4.66MB/s eta 11m 46s" + "Epoch 0: 1%| | 159/14932 [00:25<39:43, 6.20it/s, v_num=o30c, train/loss=5.220\r", + "Epoch 0: 1%| | 159/14932 [00:25<39:43, 6.20it/s, v_num=o30c, train/loss=2.810" ] }, { @@ -2297,7 +2618,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 8%[> ] 132.02M 4.75MB/s eta 11m 46s" + "Epoch 0: 1%| | 160/14932 [00:25<39:46, 6.19it/s, v_num=o30c, train/loss=2.810\r", + "Epoch 0: 1%| | 160/14932 [00:25<39:46, 6.19it/s, v_num=o30c, train/loss=4.120" ] }, { @@ -2305,7 +2627,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 8%[> ] 133.32M 4.66MB/s eta 11m 46s" + "Epoch 0: 1%| | 161/14932 [00:25<39:37, 6.21it/s, v_num=o30c, train/loss=4.120\r", + "Epoch 0: 1%| | 161/14932 [00:25<39:37, 6.21it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -2313,7 +2636,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 8%[> ] 134.63M 4.72MB/s eta 11m 46s" + "Epoch 0: 1%| | 162/14932 [00:25<39:29, 6.23it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -2321,7 +2644,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 8%[> ] 135.96M 4.75MB/s eta 11m 23s" + "Epoch 0: 1%| | 162/14932 [00:25<39:29, 6.23it/s, v_num=o30c, train/loss=2.610" ] }, { @@ -2329,7 +2652,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 8%[> ] 136.72M 4.69MB/s eta 11m 23s" + "Epoch 0: 1%| | 163/14932 [00:26<39:21, 6.26it/s, v_num=o30c, train/loss=2.610\r", + "Epoch 0: 1%| | 163/14932 [00:26<39:21, 6.25it/s, v_num=o30c, train/loss=3.730" ] }, { @@ -2337,7 +2661,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 8%[> ] 139.18M 5.03MB/s eta 11m 23s" + "Epoch 0: 1%| | 164/14932 [00:26<40:04, 6.14it/s, v_num=o30c, train/loss=3.730\r", + "Epoch 0: 1%| | 164/14932 [00:26<40:04, 6.14it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -2345,7 +2670,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 8%[> ] 139.77M 4.88MB/s eta 11m 23s" + "Epoch 0: 1%| | 165/14932 [00:26<39:55, 6.16it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 1%| | 165/14932 [00:26<39:55, 6.16it/s, v_num=o30c, train/loss=4.970" ] }, { @@ -2353,7 +2679,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 8%[> ] 140.38M 4.81MB/s eta 11m 23s" + "Epoch 0: 1%| | 166/14932 [00:26<39:47, 6.18it/s, v_num=o30c, train/loss=4.970\r", + "Epoch 0: 1%| | 166/14932 [00:26<39:47, 6.18it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -2361,7 +2688,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 8%[> ] 141.36M 4.71MB/s eta 11m 7s " + "Epoch 0: 1%| | 167/14932 [00:26<39:38, 6.21it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 1%| | 167/14932 [00:26<39:38, 6.21it/s, v_num=o30c, train/loss=2.770" ] }, { @@ -2369,7 +2697,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 8%[> ] 142.36M 4.86MB/s eta 11m 7s " + "Epoch 0: 1%| | 168/14932 [00:26<39:30, 6.23it/s, v_num=o30c, train/loss=2.770\r", + "Epoch 0: 1%| | 168/14932 [00:26<39:30, 6.23it/s, v_num=o30c, train/loss=4.000" ] }, { @@ -2377,7 +2706,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 8%[> ] 143.33M 4.70MB/s eta 11m 7s " + "Epoch 0: 1%| | 169/14932 [00:27<39:21, 6.25it/s, v_num=o30c, train/loss=4.000\r", + "Epoch 0: 1%| | 169/14932 [00:27<39:21, 6.25it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -2385,7 +2715,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 8%[> ] 144.33M 4.85MB/s eta 11m 7s " + "Epoch 0: 1%| | 170/14932 [00:27<39:13, 6.27it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -2393,7 +2723,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 8%[> ] 145.30M 4.73MB/s eta 11m 7s " + "Epoch 0: 1%| | 170/14932 [00:27<39:13, 6.27it/s, v_num=o30c, train/loss=2.410" ] }, { @@ -2401,7 +2731,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 8%[> ] 146.29M 4.63MB/s eta 10m 53s" + "Epoch 0: 1%| | 171/14932 [00:27<39:06, 6.29it/s, v_num=o30c, train/loss=2.410\r", + "Epoch 0: 1%| | 171/14932 [00:27<39:06, 6.29it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -2409,7 +2740,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 8%[> ] 146.86M 4.59MB/s eta 10m 53s" + "Epoch 0: 1%| | 172/14932 [00:27<38:58, 6.31it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 1%| | 172/14932 [00:27<38:58, 6.31it/s, v_num=o30c, train/loss=2.220" ] }, { @@ -2417,7 +2749,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 8%[> ] 147.63M 4.40MB/s eta 10m 53s" + "Epoch 0: 1%| | 173/14932 [00:27<38:50, 6.33it/s, v_num=o30c, train/loss=2.220\r", + "Epoch 0: 1%| | 173/14932 [00:27<38:50, 6.33it/s, v_num=o30c, train/loss=3.190" ] }, { @@ -2425,7 +2758,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 9%[> ] 148.69M 4.58MB/s eta 10m 53s" + "Epoch 0: 1%| | 174/14932 [00:27<38:43, 6.35it/s, v_num=o30c, train/loss=3.190\r", + "Epoch 0: 1%| | 174/14932 [00:27<38:43, 6.35it/s, v_num=o30c, train/loss=3.700" ] }, { @@ -2433,7 +2767,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 9%[> ] 149.74M 4.35MB/s eta 10m 53s" + "Epoch 0: 1%| | 175/14932 [00:27<38:36, 6.37it/s, v_num=o30c, train/loss=3.700\r", + "Epoch 0: 1%| | 175/14932 [00:27<38:36, 6.37it/s, v_num=o30c, train/loss=3.080" ] }, { @@ -2441,7 +2776,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 9%[> ] 150.80M 4.53MB/s eta 10m 42s" + "Epoch 0: 1%| | 176/14932 [00:27<38:29, 6.39it/s, v_num=o30c, train/loss=3.080\r", + "Epoch 0: 1%| | 176/14932 [00:27<38:29, 6.39it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -2449,7 +2785,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 9%[> ] 151.80M 4.41MB/s eta 10m 42s" + "Epoch 0: 1%| | 177/14932 [00:27<38:22, 6.41it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 1%| | 177/14932 [00:27<38:22, 6.41it/s, v_num=o30c, train/loss=1.950" ] }, { @@ -2457,7 +2794,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 9%[> ] 152.85M 4.72MB/s eta 10m 42s" + "Epoch 0: 1%| | 178/14932 [00:27<38:15, 6.43it/s, v_num=o30c, train/loss=1.950\r", + "Epoch 0: 1%| | 178/14932 [00:27<38:15, 6.43it/s, v_num=o30c, train/loss=2.360" ] }, { @@ -2465,7 +2803,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 9%[> ] 152.89M 4.05MB/s eta 10m 42s" + "Epoch 0: 1%| | 179/14932 [00:27<38:08, 6.45it/s, v_num=o30c, train/loss=2.360\r", + "Epoch 0: 1%| | 179/14932 [00:27<38:08, 6.45it/s, v_num=o30c, train/loss=2.500" ] }, { @@ -2473,7 +2812,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 9%[> ] 154.83M 4.28MB/s eta 10m 42s" + "Epoch 0: 1%| | 180/14932 [00:27<38:01, 6.46it/s, v_num=o30c, train/loss=2.500\r", + "Epoch 0: 1%| | 180/14932 [00:27<38:01, 6.46it/s, v_num=o30c, train/loss=1.340" ] }, { @@ -2481,7 +2821,7 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 9%[> ] 155.60M 4.21MB/s eta 10m 32s" + "Epoch 0: 1%| | 181/14932 [00:27<37:55, 6.48it/s, v_num=o30c, train/loss=1.340" ] }, { @@ -2489,7 +2829,7 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 9%[> ] 155.96M 4.22MB/s eta 10m 32s" + "Epoch 0: 1%| | 181/14932 [00:27<37:55, 6.48it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -2497,7 +2837,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 9%[> ] 156.61M 4.00MB/s eta 10m 32s" + "Epoch 0: 1%| | 182/14932 [00:27<37:48, 6.50it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 1%| | 182/14932 [00:27<37:48, 6.50it/s, v_num=o30c, train/loss=4.280" ] }, { @@ -2505,7 +2846,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 9%[> ] 157.46M 4.09MB/s eta 10m 32s" + "Epoch 0: 1%| | 183/14932 [00:28<37:41, 6.52it/s, v_num=o30c, train/loss=4.280\r", + "Epoch 0: 1%| | 183/14932 [00:28<37:41, 6.52it/s, v_num=o30c, train/loss=4.500" ] }, { @@ -2513,7 +2855,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 9%[> ] 158.30M 3.88MB/s eta 10m 32s" + "Epoch 0: 1%| | 184/14932 [00:28<37:35, 6.54it/s, v_num=o30c, train/loss=4.500\r", + "Epoch 0: 1%| | 184/14932 [00:28<37:35, 6.54it/s, v_num=o30c, train/loss=4.060" ] }, { @@ -2521,7 +2864,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 9%[> ] 159.18M 3.97MB/s eta 10m 27s" + "Epoch 0: 1%| | 185/14932 [00:28<37:28, 6.56it/s, v_num=o30c, train/loss=4.060\r", + "Epoch 0: 1%| | 185/14932 [00:28<37:28, 6.56it/s, v_num=o30c, train/loss=2.450" ] }, { @@ -2529,7 +2873,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 9%[> ] 160.05M 3.86MB/s eta 10m 27s" + "Epoch 0: 1%| | 186/14932 [00:28<37:22, 6.58it/s, v_num=o30c, train/loss=2.450\r", + "Epoch 0: 1%| | 186/14932 [00:28<37:22, 6.58it/s, v_num=o30c, train/loss=3.340" ] }, { @@ -2537,7 +2882,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 9%[> ] 160.93M 3.87MB/s eta 10m 27s" + "Epoch 0: 1%| | 187/14932 [00:28<37:16, 6.59it/s, v_num=o30c, train/loss=3.340\r", + "Epoch 0: 1%| | 187/14932 [00:28<37:16, 6.59it/s, v_num=o30c, train/loss=3.560" ] }, { @@ -2545,7 +2891,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 9%[> ] 161.83M 3.77MB/s eta 10m 27s" + "Epoch 0: 1%| | 188/14932 [00:28<37:09, 6.61it/s, v_num=o30c, train/loss=3.560\r", + "Epoch 0: 1%| | 188/14932 [00:28<37:09, 6.61it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -2553,7 +2900,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 9%[> ] 162.72M 3.78MB/s eta 10m 27s" + "Epoch 0: 1%| | 189/14932 [00:28<37:03, 6.63it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 1%| | 189/14932 [00:28<37:03, 6.63it/s, v_num=o30c, train/loss=3.450" ] }, { @@ -2561,7 +2909,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 9%[> ] 163.64M 3.67MB/s eta 10m 19s" + "Epoch 0: 1%| | 190/14932 [00:28<36:58, 6.64it/s, v_num=o30c, train/loss=3.450\r", + "Epoch 0: 1%| | 190/14932 [00:28<36:58, 6.64it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -2569,7 +2918,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 10%[=> ] 164.57M 3.68MB/s eta 10m 19s" + "Epoch 0: 1%| | 191/14932 [00:28<36:52, 6.66it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 1%| | 191/14932 [00:28<36:52, 6.66it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -2577,7 +2927,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 10%[=> ] 165.49M 3.57MB/s eta 10m 19s" + "Epoch 0: 1%| | 192/14932 [00:28<36:55, 6.65it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 1%| | 192/14932 [00:28<36:55, 6.65it/s, v_num=o30c, train/loss=2.880" ] }, { @@ -2585,7 +2936,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 10%[=> ] 166.41M 3.77MB/s eta 10m 19s" + "Epoch 0: 1%| | 193/14932 [00:28<36:49, 6.67it/s, v_num=o30c, train/loss=2.880\r", + "Epoch 0: 1%| | 193/14932 [00:28<36:49, 6.67it/s, v_num=o30c, train/loss=3.730" ] }, { @@ -2593,7 +2945,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 10%[=> ] 167.35M 3.57MB/s eta 10m 19s" + "Epoch 0: 1%| | 194/14932 [00:29<36:43, 6.69it/s, v_num=o30c, train/loss=3.730\r", + "Epoch 0: 1%| | 194/14932 [00:29<36:43, 6.69it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -2601,7 +2954,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 10%[=> ] 168.29M 3.54MB/s eta 10m 11s" + "Epoch 0: 1%| | 195/14932 [00:29<36:37, 6.71it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 1%| | 195/14932 [00:29<36:37, 6.71it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -2609,7 +2963,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 10%[=> ] 169.22M 3.64MB/s eta 10m 11s" + "Epoch 0: 1%| | 196/14932 [00:29<36:36, 6.71it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 1%| | 196/14932 [00:29<36:36, 6.71it/s, v_num=o30c, train/loss=3.840" ] }, { @@ -2617,7 +2972,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 10%[=> ] 170.18M 3.55MB/s eta 10m 11s" + "Epoch 0: 1%| | 197/14932 [00:29<36:30, 6.73it/s, v_num=o30c, train/loss=3.840\r", + "Epoch 0: 1%| | 197/14932 [00:29<36:30, 6.73it/s, v_num=o30c, train/loss=3.300" ] }, { @@ -2625,7 +2981,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 10%[=> ] 171.11M 3.72MB/s eta 10m 11s" + "Epoch 0: 1%| | 198/14932 [00:29<36:24, 6.74it/s, v_num=o30c, train/loss=3.300\r", + "Epoch 0: 1%| | 198/14932 [00:29<36:24, 6.74it/s, v_num=o30c, train/loss=1.560" ] }, { @@ -2633,7 +2990,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 10%[=> ] 172.07M 3.67MB/s eta 10m 11s" + "Epoch 0: 1%| | 199/14932 [00:29<36:19, 6.76it/s, v_num=o30c, train/loss=1.560\r", + "Epoch 0: 1%| | 199/14932 [00:29<36:19, 6.76it/s, v_num=o30c, train/loss=3.800" ] }, { @@ -2641,7 +2999,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 10%[=> ] 172.14M 3.33MB/s eta 10m 7s " + "Epoch 0: 1%| | 200/14932 [00:29<36:13, 6.78it/s, v_num=o30c, train/loss=3.800\r", + "Epoch 0: 1%| | 200/14932 [00:29<36:13, 6.78it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -2649,7 +3008,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 10%[=> ] 174.02M 3.63MB/s eta 10m 7s " + "Epoch 0: 1%| | 201/14932 [00:29<36:07, 6.80it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 1%| | 201/14932 [00:29<36:07, 6.80it/s, v_num=o30c, train/loss=2.660" ] }, { @@ -2657,7 +3017,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 10%[=> ] 174.29M 3.62MB/s eta 10m 7s " + "Epoch 0: 1%| | 202/14932 [00:29<36:02, 6.81it/s, v_num=o30c, train/loss=2.660\r", + "Epoch 0: 1%| | 202/14932 [00:29<36:02, 6.81it/s, v_num=o30c, train/loss=4.720" ] }, { @@ -2665,7 +3026,8 @@ "output_type": "stream", "text": [ "\r", - "pth 10%[=> ] 174.97M 3.50MB/s eta 10m 7s " + "Epoch 0: 1%| | 203/14932 [00:29<35:56, 6.83it/s, v_num=o30c, train/loss=4.720\r", + "Epoch 0: 1%| | 203/14932 [00:29<35:56, 6.83it/s, v_num=o30c, train/loss=4.780" ] }, { @@ -2673,7 +3035,8 @@ "output_type": "stream", "text": [ "\r", - "th 10%[=> ] 175.66M 3.38MB/s eta 10m 7s " + "Epoch 0: 1%| | 204/14932 [00:29<35:52, 6.84it/s, v_num=o30c, train/loss=4.780\r", + "Epoch 0: 1%| | 204/14932 [00:29<35:52, 6.84it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -2681,7 +3044,8 @@ "output_type": "stream", "text": [ "\r", - "h 10%[=> ] 176.36M 3.41MB/s eta 10m 0s " + "Epoch 0: 1%| | 205/14932 [00:29<35:46, 6.86it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 1%| | 205/14932 [00:29<35:46, 6.86it/s, v_num=o30c, train/loss=4.280" ] }, { @@ -2689,7 +3053,8 @@ "output_type": "stream", "text": [ "\r", - " 10%[=> ] 177.08M 3.27MB/s eta 10m 0s " + "Epoch 0: 1%| | 206/14932 [00:29<35:41, 6.88it/s, v_num=o30c, train/loss=4.280\r", + "Epoch 0: 1%| | 206/14932 [00:29<35:41, 6.88it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -2697,7 +3062,8 @@ "output_type": "stream", "text": [ "\r", - " v 10%[=> ] 177.82M 3.29MB/s eta 10m 0s " + "Epoch 0: 1%| | 207/14932 [00:30<35:36, 6.89it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 1%| | 207/14932 [00:30<35:36, 6.89it/s, v_num=o30c, train/loss=3.200" ] }, { @@ -2705,7 +3071,8 @@ "output_type": "stream", "text": [ "\r", - " v5 10%[=> ] 178.57M 3.19MB/s eta 10m 0s " + "Epoch 0: 1%| | 208/14932 [00:30<35:30, 6.91it/s, v_num=o30c, train/loss=3.200\r", + "Epoch 0: 1%| | 208/14932 [00:30<35:30, 6.91it/s, v_num=o30c, train/loss=3.220" ] }, { @@ -2713,7 +3080,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 10%[=> ] 179.32M 3.18MB/s eta 9m 57s " + "Epoch 0: 1%| | 209/14932 [00:30<35:25, 6.93it/s, v_num=o30c, train/loss=3.220\r", + "Epoch 0: 1%| | 209/14932 [00:30<35:25, 6.93it/s, v_num=o30c, train/loss=2.810" ] }, { @@ -2721,7 +3089,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 10%[=> ] 180.07M 3.09MB/s eta 9m 57s " + "Epoch 0: 1%| | 210/14932 [00:30<35:20, 6.94it/s, v_num=o30c, train/loss=2.810\r", + "Epoch 0: 1%| | 210/14932 [00:30<35:20, 6.94it/s, v_num=o30c, train/loss=4.380" ] }, { @@ -2729,7 +3098,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 11%[=> ] 180.83M 3.09MB/s eta 9m 57s " + "Epoch 0: 1%| | 211/14932 [00:30<35:15, 6.96it/s, v_num=o30c, train/loss=4.380\r", + "Epoch 0: 1%| | 211/14932 [00:30<35:15, 6.96it/s, v_num=o30c, train/loss=1.910" ] }, { @@ -2737,7 +3107,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 11%[=> ] 181.61M 2.99MB/s eta 9m 57s " + "Epoch 0: 1%| | 212/14932 [00:30<35:10, 6.97it/s, v_num=o30c, train/loss=1.910\r", + "Epoch 0: 1%| | 212/14932 [00:30<35:10, 6.97it/s, v_num=o30c, train/loss=3.060" ] }, { @@ -2745,7 +3116,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 11%[=> ] 182.39M 3.10MB/s eta 9m 57s " + "Epoch 0: 1%| | 213/14932 [00:30<35:05, 6.99it/s, v_num=o30c, train/loss=3.060\r", + "Epoch 0: 1%| | 213/14932 [00:30<35:05, 6.99it/s, v_num=o30c, train/loss=3.160" ] }, { @@ -2753,7 +3125,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 11%[=> ] 183.18M 2.92MB/s eta 9m 53s " + "Epoch 0: 1%| | 214/14932 [00:30<35:00, 7.01it/s, v_num=o30c, train/loss=3.160\r", + "Epoch 0: 1%| | 214/14932 [00:30<35:00, 7.01it/s, v_num=o30c, train/loss=4.220" ] }, { @@ -2761,7 +3134,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 11%[=> ] 183.96M 3.26MB/s eta 9m 53s " + "Epoch 0: 1%| | 215/14932 [00:30<34:55, 7.02it/s, v_num=o30c, train/loss=4.220\r", + "Epoch 0: 1%| | 215/14932 [00:30<34:55, 7.02it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -2769,7 +3143,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 11%[=> ] 184.75M 3.04MB/s eta 9m 53s " + "Epoch 0: 1%| | 216/14932 [00:30<34:50, 7.04it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 1%| | 216/14932 [00:30<34:50, 7.04it/s, v_num=o30c, train/loss=2.380" ] }, { @@ -2777,7 +3152,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 11%[=> ] 185.55M 3.01MB/s eta 9m 53s " + "Epoch 0: 1%| | 217/14932 [00:30<34:45, 7.06it/s, v_num=o30c, train/loss=2.380\r", + "Epoch 0: 1%| | 217/14932 [00:30<34:45, 7.06it/s, v_num=o30c, train/loss=4.220" ] }, { @@ -2785,7 +3161,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 11%[=> ] 186.35M 3.00MB/s eta 9m 53s " + "Epoch 0: 1%| | 218/14932 [00:30<34:40, 7.07it/s, v_num=o30c, train/loss=4.220\r", + "Epoch 0: 1%| | 218/14932 [00:30<34:40, 7.07it/s, v_num=o30c, train/loss=2.890" ] }, { @@ -2793,7 +3170,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 11%[=> ] 187.14M 3.11MB/s eta 9m 48s " + "Epoch 0: 1%| | 219/14932 [00:30<34:40, 7.07it/s, v_num=o30c, train/loss=2.890\r", + "Epoch 0: 1%| | 219/14932 [00:30<34:40, 7.07it/s, v_num=o30c, train/loss=4.440" ] }, { @@ -2801,7 +3179,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 11%[=> ] 187.96M 3.06MB/s eta 9m 48s " + "Epoch 0: 1%| | 220/14932 [00:31<34:34, 7.09it/s, v_num=o30c, train/loss=4.440\r", + "Epoch 0: 1%| | 220/14932 [00:31<34:34, 7.09it/s, v_num=o30c, train/loss=3.980" ] }, { @@ -2809,7 +3188,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 11%[=> ] 188.68M 3.00MB/s eta 9m 48s " + "Epoch 0: 1%| | 221/14932 [00:31<34:30, 7.11it/s, v_num=o30c, train/loss=3.980\r", + "Epoch 0: 1%| | 221/14932 [00:31<34:30, 7.11it/s, v_num=o30c, train/loss=1.910" ] }, { @@ -2817,7 +3197,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 11%[=> ] 190.07M 3.14MB/s eta 9m 48s " + "Epoch 0: 1%| | 222/14932 [00:31<34:25, 7.12it/s, v_num=o30c, train/loss=1.910\r", + "Epoch 0: 1%| | 222/14932 [00:31<34:25, 7.12it/s, v_num=o30c, train/loss=4.380" ] }, { @@ -2825,7 +3206,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 11%[=> ] 190.63M 3.09MB/s eta 9m 45s " + "Epoch 0: 1%| | 223/14932 [00:31<34:21, 7.13it/s, v_num=o30c, train/loss=4.380\r", + "Epoch 0: 1%| | 223/14932 [00:31<34:21, 7.13it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -2833,7 +3215,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 11%[=> ] 191.18M 3.16MB/s eta 9m 45s " + "Epoch 0: 2%| | 224/14932 [00:31<34:24, 7.12it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 2%| | 224/14932 [00:31<34:24, 7.12it/s, v_num=o30c, train/loss=2.890" ] }, { @@ -2841,7 +3224,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 11%[=> ] 191.68M 3.04MB/s eta 9m 45s " + "Epoch 0: 2%| | 225/14932 [00:31<34:19, 7.14it/s, v_num=o30c, train/loss=2.890\r", + "Epoch 0: 2%| | 225/14932 [00:31<34:19, 7.14it/s, v_num=o30c, train/loss=4.280" ] }, { @@ -2849,7 +3233,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 11%[=> ] 192.27M 3.08MB/s eta 9m 45s " + "Epoch 0: 2%| | 226/14932 [00:31<34:15, 7.16it/s, v_num=o30c, train/loss=4.280\r", + "Epoch 0: 2%| | 226/14932 [00:31<34:15, 7.16it/s, v_num=o30c, train/loss=2.700" ] }, { @@ -2857,7 +3242,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 11%[=> ] 192.88M 3.01MB/s eta 9m 45s " + "Epoch 0: 2%| | 227/14932 [00:31<34:10, 7.17it/s, v_num=o30c, train/loss=2.700\r", + "Epoch 0: 2%| | 227/14932 [00:31<34:10, 7.17it/s, v_num=o30c, train/loss=1.350" ] }, { @@ -2865,7 +3251,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 11%[=> ] 193.11M 2.85MB/s eta 9m 44s " + "Epoch 0: 2%| | 228/14932 [00:31<34:06, 7.19it/s, v_num=o30c, train/loss=1.350\r", + "Epoch 0: 2%| | 228/14932 [00:31<34:06, 7.19it/s, v_num=o30c, train/loss=1.530" ] }, { @@ -2873,7 +3260,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 11%[=> ] 193.74M 2.82MB/s eta 9m 44s " + "Epoch 0: 2%| | 229/14932 [00:31<34:01, 7.20it/s, v_num=o30c, train/loss=1.530\r", + "Epoch 0: 2%| | 229/14932 [00:31<34:01, 7.20it/s, v_num=o30c, train/loss=2.470" ] }, { @@ -2881,7 +3269,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 11%[=> ] 194.38M 2.81MB/s eta 9m 44s " + "Epoch 0: 2%| | 230/14932 [00:31<33:57, 7.22it/s, v_num=o30c, train/loss=2.470\r", + "Epoch 0: 2%| | 230/14932 [00:31<33:57, 7.22it/s, v_num=o30c, train/loss=2.280" ] }, { @@ -2889,7 +3278,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 11%[=> ] 195.02M 2.77MB/s eta 9m 44s " + "Epoch 0: 2%| | 231/14932 [00:31<33:52, 7.23it/s, v_num=o30c, train/loss=2.280\r", + "Epoch 0: 2%| | 231/14932 [00:31<33:52, 7.23it/s, v_num=o30c, train/loss=3.780" ] }, { @@ -2897,7 +3287,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 11%[=> ] 195.68M 2.84MB/s eta 9m 44s " + "Epoch 0: 2%| | 232/14932 [00:32<33:48, 7.25it/s, v_num=o30c, train/loss=3.780\r", + "Epoch 0: 2%| | 232/14932 [00:32<33:48, 7.25it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -2905,7 +3296,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 11%[=> ] 196.33M 2.69MB/s eta 9m 42s " + "Epoch 0: 2%| | 233/14932 [00:32<33:44, 7.26it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 2%| | 233/14932 [00:32<33:44, 7.26it/s, v_num=o30c, train/loss=4.190" ] }, { @@ -2913,7 +3305,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 12%[=> ] 196.99M 2.61MB/s eta 9m 42s " + "Epoch 0: 2%| | 234/14932 [00:32<33:40, 7.28it/s, v_num=o30c, train/loss=4.190\r", + "Epoch 0: 2%| | 234/14932 [00:32<33:40, 7.28it/s, v_num=o30c, train/loss=2.220" ] }, { @@ -2921,7 +3314,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 12%[=> ] 198.13M 2.65MB/s eta 9m 42s " + "Epoch 0: 2%| | 235/14932 [00:32<33:35, 7.29it/s, v_num=o30c, train/loss=2.220\r", + "Epoch 0: 2%| | 235/14932 [00:32<33:36, 7.29it/s, v_num=o30c, train/loss=2.340" ] }, { @@ -2929,7 +3323,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 12%[=> ] 198.60M 2.58MB/s eta 9m 42s " + "Epoch 0: 2%| | 236/14932 [00:32<33:32, 7.30it/s, v_num=o30c, train/loss=2.340\r", + "Epoch 0: 2%| | 236/14932 [00:32<33:32, 7.30it/s, v_num=o30c, train/loss=4.410" ] }, { @@ -2937,7 +3332,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 12%[=> ] 199.08M 2.69MB/s eta 9m 41s " + "Epoch 0: 2%| | 237/14932 [00:32<33:28, 7.32it/s, v_num=o30c, train/loss=4.410\r", + "Epoch 0: 2%| | 237/14932 [00:32<33:28, 7.32it/s, v_num=o30c, train/loss=3.500" ] }, { @@ -2945,7 +3341,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 12%[=> ] 199.58M 2.43MB/s eta 9m 41s " + "Epoch 0: 2%| | 238/14932 [00:32<33:24, 7.33it/s, v_num=o30c, train/loss=3.500\r", + "Epoch 0: 2%| | 238/14932 [00:32<33:24, 7.33it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -2953,7 +3350,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 12%[=> ] 200.08M 2.41MB/s eta 9m 41s " + "Epoch 0: 2%| | 239/14932 [00:32<33:19, 7.35it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 2%| | 239/14932 [00:32<33:19, 7.35it/s, v_num=o30c, train/loss=2.830" ] }, { @@ -2961,7 +3359,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 12%[=> ] 200.60M 2.44MB/s eta 9m 41s " + "Epoch 0: 2%| | 240/14932 [00:32<33:16, 7.36it/s, v_num=o30c, train/loss=2.830\r", + "Epoch 0: 2%| | 240/14932 [00:32<33:16, 7.36it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -2969,7 +3368,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 12%[=> ] 201.13M 2.39MB/s eta 9m 41s " + "Epoch 0: 2%| | 241/14932 [00:32<33:11, 7.38it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 2%| | 241/14932 [00:32<33:12, 7.37it/s, v_num=o30c, train/loss=4.250" ] }, { @@ -2977,7 +3377,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 12%[=> ] 201.66M 2.37MB/s eta 9m 41s " + "Epoch 0: 2%| | 242/14932 [00:32<33:08, 7.39it/s, v_num=o30c, train/loss=4.250\r", + "Epoch 0: 2%| | 242/14932 [00:32<33:08, 7.39it/s, v_num=o30c, train/loss=2.800" ] }, { @@ -2985,7 +3386,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 12%[=> ] 202.19M 2.37MB/s eta 9m 41s " + "Epoch 0: 2%| | 243/14932 [00:32<33:04, 7.40it/s, v_num=o30c, train/loss=2.800\r", + "Epoch 0: 2%| | 243/14932 [00:32<33:04, 7.40it/s, v_num=o30c, train/loss=3.120" ] }, { @@ -2993,7 +3395,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 12%[=> ] 202.74M 2.33MB/s eta 9m 41s " + "Epoch 0: 2%| | 244/14932 [00:32<33:04, 7.40it/s, v_num=o30c, train/loss=3.120\r", + "Epoch 0: 2%| | 244/14932 [00:32<33:04, 7.40it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -3001,7 +3404,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 12%[=> ] 203.27M 2.30MB/s eta 9m 41s " + "Epoch 0: 2%| | 245/14932 [00:33<33:00, 7.42it/s, v_num=o30c, train/loss=2.920\r", + "Epoch 0: 2%| | 245/14932 [00:33<33:00, 7.42it/s, v_num=o30c, train/loss=3.910" ] }, { @@ -3009,7 +3413,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 12%[=> ] 203.83M 2.34MB/s eta 9m 41s " + "Epoch 0: 2%| | 246/14932 [00:33<32:56, 7.43it/s, v_num=o30c, train/loss=3.910\r", + "Epoch 0: 2%| | 246/14932 [00:33<32:56, 7.43it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -3017,7 +3422,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 12%[=> ] 204.38M 2.26MB/s eta 9m 41s " + "Epoch 0: 2%| | 247/14932 [00:33<32:52, 7.44it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 2%| | 247/14932 [00:33<32:52, 7.44it/s, v_num=o30c, train/loss=2.580" ] }, { @@ -3025,7 +3431,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 12%[=> ] 204.93M 2.21MB/s eta 9m 41s " + "Epoch 0: 2%| | 248/14932 [00:33<32:48, 7.46it/s, v_num=o30c, train/loss=2.580\r", + "Epoch 0: 2%| | 248/14932 [00:33<32:48, 7.46it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -3033,7 +3440,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 12%[=> ] 205.49M 2.40MB/s eta 9m 41s " + "Epoch 0: 2%| | 249/14932 [00:33<32:44, 7.47it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 2%| | 249/14932 [00:33<32:44, 7.47it/s, v_num=o30c, train/loss=2.060" ] }, { @@ -3041,7 +3449,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 12%[=> ] 206.05M 2.21MB/s eta 9m 41s " + "Epoch 0: 2%| | 250/14932 [00:33<32:40, 7.49it/s, v_num=o30c, train/loss=2.060\r", + "Epoch 0: 2%| | 250/14932 [00:33<32:40, 7.49it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -3049,7 +3458,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 12%[=> ] 206.61M 2.21MB/s eta 9m 41s " + "Epoch 0: 2%| | 251/14932 [00:33<32:36, 7.50it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 2%| | 251/14932 [00:33<32:36, 7.50it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -3057,7 +3467,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 12%[=> ] 207.18M 2.30MB/s eta 9m 40s " + "Epoch 0: 2%| | 252/14932 [00:33<32:32, 7.52it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 2%| | 252/14932 [00:33<32:32, 7.52it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -3065,7 +3476,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 12%[=> ] 207.75M 2.26MB/s eta 9m 40s " + "Epoch 0: 2%| | 253/14932 [00:33<32:28, 7.53it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 2%| | 253/14932 [00:33<32:28, 7.53it/s, v_num=o30c, train/loss=3.800" ] }, { @@ -3073,7 +3485,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 12%[=> ] 208.32M 2.26MB/s eta 9m 40s " + "Epoch 0: 2%| | 254/14932 [00:33<32:25, 7.55it/s, v_num=o30c, train/loss=3.800\r", + "Epoch 0: 2%| | 254/14932 [00:33<32:25, 7.55it/s, v_num=o30c, train/loss=2.330" ] }, { @@ -3081,7 +3494,8 @@ "output_type": "stream", "text": [ "\r", - "pth 12%[=> ] 208.88M 2.34MB/s eta 9m 40s " + "Epoch 0: 2%| | 255/14932 [00:33<32:22, 7.56it/s, v_num=o30c, train/loss=2.330\r", + "Epoch 0: 2%| | 255/14932 [00:33<32:22, 7.56it/s, v_num=o30c, train/loss=2.940" ] }, { @@ -3089,7 +3503,7 @@ "output_type": "stream", "text": [ "\r", - "th 12%[=> ] 209.44M 2.29MB/s eta 9m 40s " + "Epoch 0: 2%| | 256/14932 [00:33<32:25, 7.54it/s, v_num=o30c, train/loss=2.940" ] }, { @@ -3097,7 +3511,7 @@ "output_type": "stream", "text": [ "\r", - "h 12%[=> ] 210.02M 2.30MB/s eta 9m 39s " + "Epoch 0: 2%| | 256/14932 [00:33<32:25, 7.54it/s, v_num=o30c, train/loss=3.720" ] }, { @@ -3105,7 +3519,8 @@ "output_type": "stream", "text": [ "\r", - " 12%[=> ] 210.58M 2.37MB/s eta 9m 39s " + "Epoch 0: 2%| | 257/14932 [00:34<32:22, 7.56it/s, v_num=o30c, train/loss=3.720\r", + "Epoch 0: 2%| | 257/14932 [00:34<32:22, 7.56it/s, v_num=o30c, train/loss=2.380" ] }, { @@ -3113,7 +3528,8 @@ "output_type": "stream", "text": [ "\r", - " v 12%[=> ] 211.14M 2.31MB/s eta 9m 39s " + "Epoch 0: 2%| | 258/14932 [00:34<32:18, 7.57it/s, v_num=o30c, train/loss=2.380\r", + "Epoch 0: 2%| | 258/14932 [00:34<32:18, 7.57it/s, v_num=o30c, train/loss=3.300" ] }, { @@ -3121,7 +3537,8 @@ "output_type": "stream", "text": [ "\r", - " v5 12%[=> ] 211.71M 2.31MB/s eta 9m 39s " + "Epoch 0: 2%| | 259/14932 [00:34<32:14, 7.58it/s, v_num=o30c, train/loss=3.300\r", + "Epoch 0: 2%| | 259/14932 [00:34<32:14, 7.58it/s, v_num=o30c, train/loss=1.600" ] }, { @@ -3129,7 +3546,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 12%[=> ] 212.27M 2.34MB/s eta 9m 39s " + "Epoch 0: 2%| | 260/14932 [00:34<32:11, 7.60it/s, v_num=o30c, train/loss=1.600\r", + "Epoch 0: 2%| | 260/14932 [00:34<32:11, 7.60it/s, v_num=o30c, train/loss=3.080" ] }, { @@ -3137,7 +3555,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 12%[=> ] 212.85M 2.31MB/s eta 9m 38s " + "Epoch 0: 2%| | 261/14932 [00:34<32:07, 7.61it/s, v_num=o30c, train/loss=3.080\r", + "Epoch 0: 2%| | 261/14932 [00:34<32:07, 7.61it/s, v_num=o30c, train/loss=2.000" ] }, { @@ -3145,7 +3564,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 13%[=> ] 213.41M 2.35MB/s eta 9m 38s " + "Epoch 0: 2%| | 262/14932 [00:34<32:04, 7.62it/s, v_num=o30c, train/loss=2.000\r", + "Epoch 0: 2%| | 262/14932 [00:34<32:04, 7.62it/s, v_num=o30c, train/loss=3.970" ] }, { @@ -3153,7 +3573,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 13%[=> ] 213.97M 2.32MB/s eta 9m 38s " + "Epoch 0: 2%| | 263/14932 [00:34<32:01, 7.64it/s, v_num=o30c, train/loss=3.970\r", + "Epoch 0: 2%| | 263/14932 [00:34<32:01, 7.64it/s, v_num=o30c, train/loss=3.060" ] }, { @@ -3161,7 +3582,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 13%[=> ] 214.54M 2.31MB/s eta 9m 38s " + "Epoch 0: 2%| | 264/14932 [00:34<31:58, 7.65it/s, v_num=o30c, train/loss=3.060\r", + "Epoch 0: 2%| | 264/14932 [00:34<31:58, 7.65it/s, v_num=o30c, train/loss=2.410" ] }, { @@ -3169,7 +3591,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 13%[=> ] 215.11M 2.35MB/s eta 9m 38s " + "Epoch 0: 2%| | 265/14932 [00:34<31:54, 7.66it/s, v_num=o30c, train/loss=2.410\r", + "Epoch 0: 2%| | 265/14932 [00:34<31:54, 7.66it/s, v_num=o30c, train/loss=2.500" ] }, { @@ -3177,7 +3600,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 13%[=> ] 215.68M 2.33MB/s eta 9m 37s " + "Epoch 0: 2%| | 266/14932 [00:34<31:51, 7.67it/s, v_num=o30c, train/loss=2.500\r", + "Epoch 0: 2%| | 266/14932 [00:34<31:51, 7.67it/s, v_num=o30c, train/loss=4.190" ] }, { @@ -3185,7 +3609,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 13%[=> ] 216.25M 2.37MB/s eta 9m 37s " + "Epoch 0: 2%| | 267/14932 [00:34<31:51, 7.67it/s, v_num=o30c, train/loss=4.190\r", + "Epoch 0: 2%| | 267/14932 [00:34<31:51, 7.67it/s, v_num=o30c, train/loss=3.530" ] }, { @@ -3193,7 +3618,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 13%[=> ] 216.82M 2.35MB/s eta 9m 37s " + "Epoch 0: 2%| | 268/14932 [00:34<31:47, 7.69it/s, v_num=o30c, train/loss=3.530\r", + "Epoch 0: 2%| | 268/14932 [00:34<31:47, 7.69it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -3201,7 +3627,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 13%[=> ] 217.39M 2.34MB/s eta 9m 37s " + "Epoch 0: 2%| | 269/14932 [00:34<31:44, 7.70it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 2%| | 269/14932 [00:34<31:44, 7.70it/s, v_num=o30c, train/loss=3.280" ] }, { @@ -3209,7 +3636,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 13%[=> ] 217.97M 2.36MB/s eta 9m 37s " + "Epoch 0: 2%| | 270/14932 [00:35<31:41, 7.71it/s, v_num=o30c, train/loss=3.280\r", + "Epoch 0: 2%| | 270/14932 [00:35<31:41, 7.71it/s, v_num=o30c, train/loss=3.090" ] }, { @@ -3217,7 +3645,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 13%[=> ] 218.55M 2.35MB/s eta 9m 36s " + "Epoch 0: 2%| | 271/14932 [00:35<31:38, 7.72it/s, v_num=o30c, train/loss=3.090\r", + "Epoch 0: 2%| | 271/14932 [00:35<31:38, 7.72it/s, v_num=o30c, train/loss=3.880" ] }, { @@ -3225,7 +3654,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 13%[=> ] 219.14M 2.35MB/s eta 9m 36s " + "Epoch 0: 2%| | 272/14932 [00:35<31:35, 7.73it/s, v_num=o30c, train/loss=3.880\r", + "Epoch 0: 2%| | 272/14932 [00:35<31:35, 7.73it/s, v_num=o30c, train/loss=3.840" ] }, { @@ -3233,7 +3663,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 13%[=> ] 219.72M 2.39MB/s eta 9m 36s " + "Epoch 0: 2%| | 273/14932 [00:35<31:31, 7.75it/s, v_num=o30c, train/loss=3.840\r", + "Epoch 0: 2%| | 273/14932 [00:35<31:31, 7.75it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -3241,7 +3672,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 13%[=> ] 220.32M 2.38MB/s eta 9m 36s " + "Epoch 0: 2%| | 274/14932 [00:35<31:28, 7.76it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 2%| | 274/14932 [00:35<31:28, 7.76it/s, v_num=o30c, train/loss=2.080" ] }, { @@ -3249,7 +3681,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 13%[=> ] 220.93M 2.40MB/s eta 9m 36s " + "Epoch 0: 2%| | 275/14932 [00:35<31:25, 7.77it/s, v_num=o30c, train/loss=2.080\r", + "Epoch 0: 2%| | 275/14932 [00:35<31:25, 7.77it/s, v_num=o30c, train/loss=5.560" ] }, { @@ -3257,7 +3690,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 13%[=> ] 221.54M 2.39MB/s eta 9m 35s " + "Epoch 0: 2%| | 276/14932 [00:35<31:22, 7.79it/s, v_num=o30c, train/loss=5.560\r", + "Epoch 0: 2%| | 276/14932 [00:35<31:22, 7.79it/s, v_num=o30c, train/loss=3.440" ] }, { @@ -3265,7 +3699,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 13%[=> ] 222.14M 2.47MB/s eta 9m 35s " + "Epoch 0: 2%| | 277/14932 [00:35<31:22, 7.79it/s, v_num=o30c, train/loss=3.440\r", + "Epoch 0: 2%| | 277/14932 [00:35<31:22, 7.79it/s, v_num=o30c, train/loss=4.620" ] }, { @@ -3273,7 +3708,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 13%[=> ] 222.75M 2.42MB/s eta 9m 35s " + "Epoch 0: 2%| | 278/14932 [00:35<31:19, 7.80it/s, v_num=o30c, train/loss=4.620\r", + "Epoch 0: 2%| | 278/14932 [00:35<31:19, 7.80it/s, v_num=o30c, train/loss=3.390" ] }, { @@ -3281,7 +3717,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 13%[=> ] 223.39M 2.41MB/s eta 9m 35s " + "Epoch 0: 2%| | 279/14932 [00:35<31:16, 7.81it/s, v_num=o30c, train/loss=3.390\r", + "Epoch 0: 2%| | 279/14932 [00:35<31:16, 7.81it/s, v_num=o30c, train/loss=4.720" ] }, { @@ -3289,7 +3726,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 13%[=> ] 224.04M 2.46MB/s eta 9m 35s " + "Epoch 0: 2%| | 280/14932 [00:35<31:13, 7.82it/s, v_num=o30c, train/loss=4.720\r", + "Epoch 0: 2%| | 280/14932 [00:35<31:13, 7.82it/s, v_num=o30c, train/loss=4.060" ] }, { @@ -3297,7 +3735,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 13%[=> ] 224.68M 2.46MB/s eta 9m 33s " + "Epoch 0: 2%| | 281/14932 [00:35<31:10, 7.83it/s, v_num=o30c, train/loss=4.060\r", + "Epoch 0: 2%| | 281/14932 [00:35<31:10, 7.83it/s, v_num=o30c, train/loss=3.910" ] }, { @@ -3305,7 +3744,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 13%[=> ] 225.35M 2.57MB/s eta 9m 33s " + "Epoch 0: 2%| | 282/14932 [00:35<31:07, 7.85it/s, v_num=o30c, train/loss=3.910\r", + "Epoch 0: 2%| | 282/14932 [00:35<31:07, 7.85it/s, v_num=o30c, train/loss=1.540" ] }, { @@ -3313,7 +3753,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 13%[=> ] 226.02M 2.54MB/s eta 9m 33s " + "Epoch 0: 2%| | 283/14932 [00:36<31:04, 7.86it/s, v_num=o30c, train/loss=1.540\r", + "Epoch 0: 2%| | 283/14932 [00:36<31:04, 7.86it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -3321,7 +3762,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 13%[=> ] 226.71M 2.56MB/s eta 9m 33s " + "Epoch 0: 2%| | 284/14932 [00:36<31:01, 7.87it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 2%| | 284/14932 [00:36<31:01, 7.87it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -3329,7 +3771,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 13%[=> ] 227.41M 2.67MB/s eta 9m 33s " + "Epoch 0: 2%| | 285/14932 [00:36<30:57, 7.88it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 2%| | 285/14932 [00:36<30:57, 7.88it/s, v_num=o30c, train/loss=2.890" ] }, { @@ -3337,7 +3780,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 13%[=> ] 228.13M 2.65MB/s eta 9m 30s " + "Epoch 0: 2%| | 286/14932 [00:36<30:55, 7.89it/s, v_num=o30c, train/loss=2.890\r", + "Epoch 0: 2%| | 286/14932 [00:36<30:55, 7.89it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -3345,7 +3789,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 13%[=> ] 228.47M 2.51MB/s eta 9m 30s " + "Epoch 0: 2%| | 287/14932 [00:36<30:52, 7.91it/s, v_num=o30c, train/loss=2.920\r", + "Epoch 0: 2%| | 287/14932 [00:36<30:52, 7.91it/s, v_num=o30c, train/loss=2.910" ] }, { @@ -3353,7 +3798,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 14%[=> ] 229.82M 2.80MB/s eta 9m 30s " + "Epoch 0: 2%| | 288/14932 [00:36<30:55, 7.89it/s, v_num=o30c, train/loss=2.910\r", + "Epoch 0: 2%| | 288/14932 [00:36<30:55, 7.89it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -3361,7 +3807,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 14%[=> ] 230.27M 2.74MB/s eta 9m 30s " + "Epoch 0: 2%| | 289/14932 [00:36<30:52, 7.91it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 2%| | 289/14932 [00:36<30:52, 7.91it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -3369,7 +3816,7 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 14%[=> ] 230.57M 2.65MB/s eta 9m 30s " + "Epoch 0: 2%| | 290/14932 [00:36<30:49, 7.92it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -3377,7 +3824,7 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 14%[=> ] 231.14M 2.67MB/s eta 9m 29s " + "Epoch 0: 2%| | 290/14932 [00:36<30:49, 7.92it/s, v_num=o30c, train/loss=3.410" ] }, { @@ -3385,7 +3832,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 14%[=> ] 231.74M 2.64MB/s eta 9m 29s " + "Epoch 0: 2%| | 291/14932 [00:36<30:46, 7.93it/s, v_num=o30c, train/loss=3.410\r", + "Epoch 0: 2%| | 291/14932 [00:36<30:46, 7.93it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -3393,7 +3841,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 14%[=> ] 232.33M 2.70MB/s eta 9m 29s " + "Epoch 0: 2%| | 292/14932 [00:36<30:43, 7.94it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 2%| | 292/14932 [00:36<30:43, 7.94it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -3401,7 +3850,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 14%[=> ] 232.96M 2.64MB/s eta 9m 29s " + "Epoch 0: 2%| | 293/14932 [00:36<30:40, 7.95it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 2%| | 293/14932 [00:36<30:40, 7.95it/s, v_num=o30c, train/loss=3.060" ] }, { @@ -3409,7 +3859,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 14%[=> ] 233.52M 2.56MB/s eta 9m 29s " + "Epoch 0: 2%| | 294/14932 [00:36<30:37, 7.96it/s, v_num=o30c, train/loss=3.060\r", + "Epoch 0: 2%| | 294/14932 [00:36<30:37, 7.96it/s, v_num=o30c, train/loss=3.410" ] }, { @@ -3417,7 +3868,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 14%[=> ] 234.64M 2.65MB/s eta 9m 29s " + "Epoch 0: 2%| | 295/14932 [00:36<30:35, 7.98it/s, v_num=o30c, train/loss=3.410\r", + "Epoch 0: 2%| | 295/14932 [00:36<30:35, 7.98it/s, v_num=o30c, train/loss=2.330" ] }, { @@ -3425,7 +3877,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 14%[=> ] 235.10M 2.59MB/s eta 9m 29s " + "Epoch 0: 2%| | 296/14932 [00:37<30:32, 7.99it/s, v_num=o30c, train/loss=2.330\r", + "Epoch 0: 2%| | 296/14932 [00:37<30:32, 7.99it/s, v_num=o30c, train/loss=2.050" ] }, { @@ -3433,7 +3886,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 14%[=> ] 235.55M 2.61MB/s eta 9m 29s " + "Epoch 0: 2%| | 297/14932 [00:37<30:29, 8.00it/s, v_num=o30c, train/loss=2.050\r", + "Epoch 0: 2%| | 297/14932 [00:37<30:29, 8.00it/s, v_num=o30c, train/loss=3.560" ] }, { @@ -3441,7 +3895,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 14%[=> ] 236.02M 2.52MB/s eta 9m 29s " + "Epoch 0: 2%| | 298/14932 [00:37<30:27, 8.01it/s, v_num=o30c, train/loss=3.560\r", + "Epoch 0: 2%| | 298/14932 [00:37<30:27, 8.01it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -3449,7 +3904,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 14%[=> ] 236.43M 2.45MB/s eta 9m 27s " + "Epoch 0: 2%| | 299/14932 [00:37<30:24, 8.02it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 2%| | 299/14932 [00:37<30:24, 8.02it/s, v_num=o30c, train/loss=3.890" ] }, { @@ -3457,7 +3913,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 14%[=> ] 236.89M 2.46MB/s eta 9m 27s " + "Epoch 0: 2%| | 300/14932 [00:37<30:22, 8.03it/s, v_num=o30c, train/loss=3.890\r", + "Epoch 0: 2%| | 300/14932 [00:37<30:22, 8.03it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -3465,7 +3922,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 14%[=> ] 237.08M 2.49MB/s eta 9m 27s " + "Epoch 0: 2%| | 301/14932 [00:37<30:19, 8.04it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 2%| | 301/14932 [00:37<30:19, 8.04it/s, v_num=o30c, train/loss=3.590" ] }, { @@ -3473,7 +3931,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 14%[=> ] 237.60M 2.29MB/s eta 9m 27s " + "Epoch 0: 2%| | 302/14932 [00:37<30:17, 8.05it/s, v_num=o30c, train/loss=3.590\r", + "Epoch 0: 2%| | 302/14932 [00:37<30:17, 8.05it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -3481,7 +3940,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 14%[=> ] 238.10M 2.22MB/s eta 9m 27s " + "Epoch 0: 2%| | 303/14932 [00:37<30:14, 8.06it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 2%| | 303/14932 [00:37<30:14, 8.06it/s, v_num=o30c, train/loss=3.270" ] }, { @@ -3489,7 +3949,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 14%[=> ] 238.63M 2.24MB/s eta 9m 27s " + "Epoch 0: 2%| | 304/14932 [00:37<30:11, 8.07it/s, v_num=o30c, train/loss=3.270\r", + "Epoch 0: 2%| | 304/14932 [00:37<30:11, 8.07it/s, v_num=o30c, train/loss=3.220" ] }, { @@ -3497,7 +3958,8 @@ "output_type": "stream", "text": [ "\r", - "pth 14%[=> ] 239.14M 2.19MB/s eta 9m 27s " + "Epoch 0: 2%| | 305/14932 [00:37<30:12, 8.07it/s, v_num=o30c, train/loss=3.220\r", + "Epoch 0: 2%| | 305/14932 [00:37<30:12, 8.07it/s, v_num=o30c, train/loss=4.220" ] }, { @@ -3505,7 +3967,8 @@ "output_type": "stream", "text": [ "\r", - "th 14%[=> ] 239.68M 2.22MB/s eta 9m 27s " + "Epoch 0: 2%| | 306/14932 [00:37<30:15, 8.06it/s, v_num=o30c, train/loss=4.220\r", + "Epoch 0: 2%| | 306/14932 [00:37<30:15, 8.06it/s, v_num=o30c, train/loss=2.940" ] }, { @@ -3513,7 +3976,8 @@ "output_type": "stream", "text": [ "\r", - "h 14%[=> ] 240.22M 2.23MB/s eta 9m 27s " + "Epoch 0: 2%| | 307/14932 [00:38<30:13, 8.07it/s, v_num=o30c, train/loss=2.940\r", + "Epoch 0: 2%| | 307/14932 [00:38<30:13, 8.07it/s, v_num=o30c, train/loss=3.220" ] }, { @@ -3521,7 +3985,8 @@ "output_type": "stream", "text": [ "\r", - " 14%[=> ] 240.75M 2.12MB/s eta 9m 27s " + "Epoch 0: 2%| | 308/14932 [00:38<30:10, 8.08it/s, v_num=o30c, train/loss=3.220\r", + "Epoch 0: 2%| | 308/14932 [00:38<30:10, 8.08it/s, v_num=o30c, train/loss=2.380" ] }, { @@ -3529,7 +3994,8 @@ "output_type": "stream", "text": [ "\r", - " v 14%[=> ] 241.29M 2.32MB/s eta 9m 27s " + "Epoch 0: 2%| | 309/14932 [00:38<30:07, 8.09it/s, v_num=o30c, train/loss=2.380\r", + "Epoch 0: 2%| | 309/14932 [00:38<30:07, 8.09it/s, v_num=o30c, train/loss=3.340" ] }, { @@ -3537,7 +4003,8 @@ "output_type": "stream", "text": [ "\r", - " v5 14%[=> ] 241.83M 2.15MB/s eta 9m 27s " + "Epoch 0: 2%| | 310/14932 [00:38<30:05, 8.10it/s, v_num=o30c, train/loss=3.340\r", + "Epoch 0: 2%| | 310/14932 [00:38<30:05, 8.10it/s, v_num=o30c, train/loss=4.030" ] }, { @@ -3545,7 +4012,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 14%[=> ] 242.38M 2.17MB/s eta 9m 27s " + "Epoch 0: 2%| | 311/14932 [00:38<30:02, 8.11it/s, v_num=o30c, train/loss=4.030\r", + "Epoch 0: 2%| | 311/14932 [00:38<30:02, 8.11it/s, v_num=o30c, train/loss=1.920" ] }, { @@ -3553,7 +4021,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 14%[=> ] 242.93M 2.19MB/s eta 9m 27s " + "Epoch 0: 2%| | 312/14932 [00:38<30:03, 8.11it/s, v_num=o30c, train/loss=1.920\r", + "Epoch 0: 2%| | 312/14932 [00:38<30:03, 8.11it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -3561,7 +4030,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 14%[=> ] 243.47M 2.21MB/s eta 9m 27s " + "Epoch 0: 2%| | 313/14932 [00:38<30:00, 8.12it/s, v_num=o30c, train/loss=3.000\r", + "Epoch 0: 2%| | 313/14932 [00:38<30:00, 8.12it/s, v_num=o30c, train/loss=4.500" ] }, { @@ -3569,7 +4039,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 14%[=> ] 244.02M 2.23MB/s eta 9m 26s " + "Epoch 0: 2%| | 314/14932 [00:38<29:58, 8.13it/s, v_num=o30c, train/loss=4.500\r", + "Epoch 0: 2%| | 314/14932 [00:38<29:58, 8.13it/s, v_num=o30c, train/loss=2.380" ] }, { @@ -3577,7 +4048,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 14%[=> ] 244.58M 2.23MB/s eta 9m 26s " + "Epoch 0: 2%| | 315/14932 [00:38<29:55, 8.14it/s, v_num=o30c, train/loss=2.380" ] }, { @@ -3585,7 +4056,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 14%[=> ] 245.13M 2.25MB/s eta 9m 26s " + "Epoch 0: 2%| | 315/14932 [00:38<29:55, 8.14it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -3593,7 +4064,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 14%[=> ] 245.68M 2.29MB/s eta 9m 26s " + "Epoch 0: 2%| | 316/14932 [00:38<29:56, 8.14it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 2%| | 316/14932 [00:38<29:56, 8.14it/s, v_num=o30c, train/loss=4.620" ] }, { @@ -3601,7 +4073,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 15%[==> ] 246.22M 2.25MB/s eta 9m 26s " + "Epoch 0: 2%| | 317/14932 [00:38<29:53, 8.15it/s, v_num=o30c, train/loss=4.620\r", + "Epoch 0: 2%| | 317/14932 [00:38<29:53, 8.15it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -3609,7 +4082,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 15%[==> ] 246.77M 2.31MB/s eta 9m 25s " + "Epoch 0: 2%| | 318/14932 [00:38<29:50, 8.16it/s, v_num=o30c, train/loss=2.920\r", + "Epoch 0: 2%| | 318/14932 [00:38<29:50, 8.16it/s, v_num=o30c, train/loss=2.620" ] }, { @@ -3617,7 +4091,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 15%[==> ] 247.33M 2.25MB/s eta 9m 25s " + "Epoch 0: 2%| | 319/14932 [00:39<29:48, 8.17it/s, v_num=o30c, train/loss=2.620\r", + "Epoch 0: 2%| | 319/14932 [00:39<29:48, 8.17it/s, v_num=o30c, train/loss=2.830" ] }, { @@ -3625,7 +4100,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 15%[==> ] 247.88M 2.30MB/s eta 9m 25s " + "Epoch 0: 2%| | 320/14932 [00:39<29:51, 8.16it/s, v_num=o30c, train/loss=2.830\r", + "Epoch 0: 2%| | 320/14932 [00:39<29:51, 8.16it/s, v_num=o30c, train/loss=3.580" ] }, { @@ -3633,7 +4109,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 15%[==> ] 248.43M 2.27MB/s eta 9m 25s " + "Epoch 0: 2%| | 321/14932 [00:39<29:49, 8.17it/s, v_num=o30c, train/loss=3.580\r", + "Epoch 0: 2%| | 321/14932 [00:39<29:49, 8.17it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -3641,7 +4118,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 15%[==> ] 248.97M 2.30MB/s eta 9m 25s " + "Epoch 0: 2%| | 322/14932 [00:39<29:46, 8.18it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 2%| | 322/14932 [00:39<29:46, 8.18it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -3649,7 +4127,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 15%[==> ] 249.52M 2.27MB/s eta 9m 25s " + "Epoch 0: 2%| | 323/14932 [00:39<29:44, 8.19it/s, v_num=o30c, train/loss=3.000\r", + "Epoch 0: 2%| | 323/14932 [00:39<29:44, 8.19it/s, v_num=o30c, train/loss=2.700" ] }, { @@ -3657,7 +4136,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 15%[==> ] 250.07M 2.30MB/s eta 9m 25s " + "Epoch 0: 2%| | 324/14932 [00:39<29:42, 8.20it/s, v_num=o30c, train/loss=2.700\r", + "Epoch 0: 2%| | 324/14932 [00:39<29:42, 8.20it/s, v_num=o30c, train/loss=3.750" ] }, { @@ -3665,7 +4145,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 15%[==> ] 250.61M 2.27MB/s eta 9m 25s " + "Epoch 0: 2%| | 325/14932 [00:39<29:39, 8.21it/s, v_num=o30c, train/loss=3.750\r", + "Epoch 0: 2%| | 325/14932 [00:39<29:39, 8.21it/s, v_num=o30c, train/loss=4.560" ] }, { @@ -3673,7 +4154,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 15%[==> ] 251.18M 2.30MB/s eta 9m 25s " + "Epoch 0: 2%| | 326/14932 [00:39<29:37, 8.22it/s, v_num=o30c, train/loss=4.560\r", + "Epoch 0: 2%| | 326/14932 [00:39<29:37, 8.22it/s, v_num=o30c, train/loss=3.700" ] }, { @@ -3681,7 +4163,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 15%[==> ] 251.72M 2.27MB/s eta 9m 25s " + "Epoch 0: 2%| | 327/14932 [00:39<29:35, 8.23it/s, v_num=o30c, train/loss=3.700\r", + "Epoch 0: 2%| | 327/14932 [00:39<29:35, 8.23it/s, v_num=o30c, train/loss=2.140" ] }, { @@ -3689,7 +4172,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 15%[==> ] 252.29M 2.31MB/s eta 9m 24s " + "Epoch 0: 2%| | 328/14932 [00:39<29:35, 8.22it/s, v_num=o30c, train/loss=2.140\r", + "Epoch 0: 2%| | 328/14932 [00:39<29:35, 8.22it/s, v_num=o30c, train/loss=3.940" ] }, { @@ -3697,7 +4181,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 15%[==> ] 252.83M 2.27MB/s eta 9m 24s " + "Epoch 0: 2%| | 329/14932 [00:40<29:36, 8.22it/s, v_num=o30c, train/loss=3.940\r", + "Epoch 0: 2%| | 329/14932 [00:40<29:36, 8.22it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -3705,7 +4190,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 15%[==> ] 253.39M 2.31MB/s eta 9m 24s " + "Epoch 0: 2%| | 330/14932 [00:40<29:36, 8.22it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 2%| | 330/14932 [00:40<29:36, 8.22it/s, v_num=o30c, train/loss=4.250" ] }, { @@ -3713,7 +4199,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 15%[==> ] 253.96M 2.31MB/s eta 9m 24s " + "Epoch 0: 2%| | 331/14932 [00:40<29:34, 8.23it/s, v_num=o30c, train/loss=4.250\r", + "Epoch 0: 2%| | 331/14932 [00:40<29:34, 8.23it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -3721,7 +4208,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 15%[==> ] 254.52M 2.32MB/s eta 9m 24s " + "Epoch 0: 2%| | 332/14932 [00:40<29:32, 8.24it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 2%| | 332/14932 [00:40<29:32, 8.24it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -3729,7 +4217,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 15%[==> ] 255.10M 2.33MB/s eta 9m 23s " + "Epoch 0: 2%| | 333/14932 [00:40<29:29, 8.25it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 2%| | 333/14932 [00:40<29:29, 8.25it/s, v_num=o30c, train/loss=2.640" ] }, { @@ -3737,7 +4226,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 15%[==> ] 255.66M 2.33MB/s eta 9m 23s " + "Epoch 0: 2%| | 334/14932 [00:40<29:27, 8.26it/s, v_num=o30c, train/loss=2.640\r", + "Epoch 0: 2%| | 334/14932 [00:40<29:27, 8.26it/s, v_num=o30c, train/loss=2.880" ] }, { @@ -3745,7 +4235,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 15%[==> ] 256.25M 2.35MB/s eta 9m 23s " + "Epoch 0: 2%| | 335/14932 [00:40<29:28, 8.26it/s, v_num=o30c, train/loss=2.880\r", + "Epoch 0: 2%| | 335/14932 [00:40<29:28, 8.26it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -3753,7 +4244,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 15%[==> ] 256.83M 2.32MB/s eta 9m 23s " + "Epoch 0: 2%| | 336/14932 [00:40<29:25, 8.27it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 2%| | 336/14932 [00:40<29:25, 8.27it/s, v_num=o30c, train/loss=1.800" ] }, { @@ -3761,7 +4253,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 15%[==> ] 257.43M 2.36MB/s eta 9m 23s " + "Epoch 0: 2%| | 337/14932 [00:40<29:23, 8.28it/s, v_num=o30c, train/loss=1.800\r", + "Epoch 0: 2%| | 337/14932 [00:40<29:23, 8.28it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -3769,7 +4262,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 15%[==> ] 258.02M 2.35MB/s eta 9m 22s " + "Epoch 0: 2%| | 338/14932 [00:40<29:21, 8.29it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 2%| | 338/14932 [00:40<29:21, 8.29it/s, v_num=o30c, train/loss=3.620" ] }, { @@ -3777,7 +4271,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 15%[==> ] 258.63M 2.43MB/s eta 9m 22s " + "Epoch 0: 2%| | 339/14932 [00:40<29:19, 8.30it/s, v_num=o30c, train/loss=3.620\r", + "Epoch 0: 2%| | 339/14932 [00:40<29:19, 8.30it/s, v_num=o30c, train/loss=4.310" ] }, { @@ -3785,7 +4280,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 15%[==> ] 259.24M 2.40MB/s eta 9m 22s " + "Epoch 0: 2%| | 340/14932 [00:40<29:16, 8.31it/s, v_num=o30c, train/loss=4.310\r", + "Epoch 0: 2%| | 340/14932 [00:40<29:16, 8.31it/s, v_num=o30c, train/loss=2.940" ] }, { @@ -3793,7 +4289,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 15%[==> ] 259.86M 2.47MB/s eta 9m 22s " + "Epoch 0: 2%| | 341/14932 [00:41<29:14, 8.31it/s, v_num=o30c, train/loss=2.940\r", + "Epoch 0: 2%| | 341/14932 [00:41<29:14, 8.31it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -3801,7 +4298,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 15%[==> ] 260.50M 2.45MB/s eta 9m 22s " + "Epoch 0: 2%| | 342/14932 [00:41<29:12, 8.32it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 2%| | 342/14932 [00:41<29:12, 8.32it/s, v_num=o30c, train/loss=2.610" ] }, { @@ -3809,7 +4307,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 15%[==> ] 261.16M 2.52MB/s eta 9m 20s " + "Epoch 0: 2%| | 343/14932 [00:41<29:10, 8.33it/s, v_num=o30c, train/loss=2.610\r", + "Epoch 0: 2%| | 343/14932 [00:41<29:10, 8.33it/s, v_num=o30c, train/loss=3.620" ] }, { @@ -3817,7 +4316,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 15%[==> ] 261.82M 2.52MB/s eta 9m 20s " + "Epoch 0: 2%| | 344/14932 [00:41<29:08, 8.34it/s, v_num=o30c, train/loss=3.620\r", + "Epoch 0: 2%| | 344/14932 [00:41<29:08, 8.34it/s, v_num=o30c, train/loss=3.170" ] }, { @@ -3825,7 +4325,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 15%[==> ] 262.49M 2.56MB/s eta 9m 20s " + "Epoch 0: 2%| | 345/14932 [00:41<29:06, 8.35it/s, v_num=o30c, train/loss=3.170\r", + "Epoch 0: 2%| | 345/14932 [00:41<29:06, 8.35it/s, v_num=o30c, train/loss=3.830" ] }, { @@ -3833,7 +4334,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 16%[==> ] 263.18M 2.58MB/s eta 9m 20s " + "Epoch 0: 2%| | 346/14932 [00:41<29:04, 8.36it/s, v_num=o30c, train/loss=3.830\r", + "Epoch 0: 2%| | 346/14932 [00:41<29:04, 8.36it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -3841,7 +4343,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 16%[==> ] 263.89M 2.63MB/s eta 9m 20s " + "Epoch 0: 2%| | 347/14932 [00:41<29:02, 8.37it/s, v_num=o30c, train/loss=3.000\r", + "Epoch 0: 2%| | 347/14932 [00:41<29:02, 8.37it/s, v_num=o30c, train/loss=2.970" ] }, { @@ -3849,7 +4352,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 16%[==> ] 264.61M 2.71MB/s eta 9m 17s " + "Epoch 0: 2%| | 348/14932 [00:41<28:59, 8.38it/s, v_num=o30c, train/loss=2.970\r", + "Epoch 0: 2%| | 348/14932 [00:41<28:59, 8.38it/s, v_num=o30c, train/loss=3.060" ] }, { @@ -3857,7 +4361,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 16%[==> ] 265.36M 2.71MB/s eta 9m 17s " + "Epoch 0: 2%| | 349/14932 [00:41<28:58, 8.39it/s, v_num=o30c, train/loss=3.060\r", + "Epoch 0: 2%| | 349/14932 [00:41<28:58, 8.39it/s, v_num=o30c, train/loss=4.530" ] }, { @@ -3865,7 +4370,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 16%[==> ] 266.13M 2.78MB/s eta 9m 17s " + "Epoch 0: 2%| | 350/14932 [00:41<28:55, 8.40it/s, v_num=o30c, train/loss=4.530\r", + "Epoch 0: 2%| | 350/14932 [00:41<28:55, 8.40it/s, v_num=o30c, train/loss=3.050" ] }, { @@ -3873,7 +4379,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 16%[==> ] 266.91M 2.82MB/s eta 9m 17s " + "Epoch 0: 2%| | 351/14932 [00:41<28:54, 8.41it/s, v_num=o30c, train/loss=3.050\r", + "Epoch 0: 2%| | 351/14932 [00:41<28:54, 8.41it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -3881,7 +4388,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 16%[==> ] 267.72M 2.90MB/s eta 9m 17s " + "Epoch 0: 2%| | 352/14932 [00:41<28:56, 8.39it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 2%| | 352/14932 [00:41<28:56, 8.39it/s, v_num=o30c, train/loss=3.440" ] }, { @@ -3889,7 +4397,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 16%[==> ] 268.55M 2.95MB/s eta 9m 14s " + "Epoch 0: 2%| | 353/14932 [00:42<28:54, 8.40it/s, v_num=o30c, train/loss=3.440\r", + "Epoch 0: 2%| | 353/14932 [00:42<28:54, 8.40it/s, v_num=o30c, train/loss=4.000" ] }, { @@ -3897,7 +4406,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 16%[==> ] 269.41M 3.04MB/s eta 9m 14s " + "Epoch 0: 2%| | 354/14932 [00:42<28:52, 8.41it/s, v_num=o30c, train/loss=4.000\r", + "Epoch 0: 2%| | 354/14932 [00:42<28:52, 8.41it/s, v_num=o30c, train/loss=2.390" ] }, { @@ -3905,7 +4415,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 16%[==> ] 270.30M 3.10MB/s eta 9m 14s " + "Epoch 0: 2%| | 355/14932 [00:42<28:50, 8.42it/s, v_num=o30c, train/loss=2.390\r", + "Epoch 0: 2%| | 355/14932 [00:42<28:50, 8.42it/s, v_num=o30c, train/loss=2.620" ] }, { @@ -3913,7 +4424,8 @@ "output_type": "stream", "text": [ "\r", - "pth 16%[==> ] 271.21M 3.21MB/s eta 9m 14s " + "Epoch 0: 2%| | 356/14932 [00:42<28:48, 8.43it/s, v_num=o30c, train/loss=2.620\r", + "Epoch 0: 2%| | 356/14932 [00:42<28:48, 8.43it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -3921,7 +4433,8 @@ "output_type": "stream", "text": [ "\r", - "th 16%[==> ] 272.16M 3.28MB/s eta 9m 14s " + "Epoch 0: 2%| | 357/14932 [00:42<28:46, 8.44it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 2%| | 357/14932 [00:42<28:46, 8.44it/s, v_num=o30c, train/loss=2.590" ] }, { @@ -3929,7 +4442,8 @@ "output_type": "stream", "text": [ "\r", - "h 16%[==> ] 273.13M 3.39MB/s eta 9m 8s " + "Epoch 0: 2%| | 358/14932 [00:42<28:47, 8.44it/s, v_num=o30c, train/loss=2.590\r", + "Epoch 0: 2%| | 358/14932 [00:42<28:47, 8.44it/s, v_num=o30c, train/loss=4.190" ] }, { @@ -3937,7 +4451,8 @@ "output_type": "stream", "text": [ "\r", - " 16%[==> ] 274.14M 3.47MB/s eta 9m 8s " + "Epoch 0: 2%| | 359/14932 [00:42<28:45, 8.45it/s, v_num=o30c, train/loss=4.190\r", + "Epoch 0: 2%| | 359/14932 [00:42<28:45, 8.45it/s, v_num=o30c, train/loss=4.690" ] }, { @@ -3945,7 +4460,8 @@ "output_type": "stream", "text": [ "\r", - " v 16%[==> ] 275.02M 3.36MB/s eta 9m 8s " + "Epoch 0: 2%| | 360/14932 [00:42<28:43, 8.45it/s, v_num=o30c, train/loss=4.690\r", + "Epoch 0: 2%| | 360/14932 [00:42<28:43, 8.45it/s, v_num=o30c, train/loss=4.030" ] }, { @@ -3953,7 +4469,8 @@ "output_type": "stream", "text": [ "\r", - " v5 16%[==> ] 276.94M 3.72MB/s eta 9m 8s " + "Epoch 0: 2%| | 361/14932 [00:42<28:41, 8.46it/s, v_num=o30c, train/loss=4.030\r", + "Epoch 0: 2%| | 361/14932 [00:42<28:41, 8.46it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -3961,7 +4478,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 16%[==> ] 277.72M 3.73MB/s eta 9m 3s " + "Epoch 0: 2%| | 362/14932 [00:42<28:40, 8.47it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 2%| | 362/14932 [00:42<28:40, 8.47it/s, v_num=o30c, train/loss=1.760" ] }, { @@ -3969,7 +4487,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 16%[==> ] 278.54M 3.75MB/s eta 9m 3s " + "Epoch 0: 2%| | 363/14932 [00:42<28:38, 8.48it/s, v_num=o30c, train/loss=1.760\r", + "Epoch 0: 2%| | 363/14932 [00:42<28:38, 8.48it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -3977,7 +4496,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 17%[==> ] 279.36M 3.76MB/s eta 9m 3s " + "Epoch 0: 2%| | 364/14932 [00:42<28:36, 8.49it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 2%| | 364/14932 [00:42<28:36, 8.49it/s, v_num=o30c, train/loss=2.750" ] }, { @@ -3985,7 +4505,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 17%[==> ] 280.21M 3.78MB/s eta 9m 3s " + "Epoch 0: 2%| | 365/14932 [00:42<28:34, 8.50it/s, v_num=o30c, train/loss=2.750\r", + "Epoch 0: 2%| | 365/14932 [00:42<28:34, 8.50it/s, v_num=o30c, train/loss=3.060" ] }, { @@ -3993,7 +4514,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 17%[==> ] 281.08M 3.79MB/s eta 9m 3s " + "Epoch 0: 2%| | 366/14932 [00:43<28:32, 8.50it/s, v_num=o30c, train/loss=3.060\r", + "Epoch 0: 2%| | 366/14932 [00:43<28:32, 8.50it/s, v_num=o30c, train/loss=3.170" ] }, { @@ -4001,7 +4523,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 17%[==> ] 281.97M 3.82MB/s eta 8m 59s " + "Epoch 0: 2%| | 367/14932 [00:43<28:30, 8.51it/s, v_num=o30c, train/loss=3.170\r", + "Epoch 0: 2%| | 367/14932 [00:43<28:30, 8.51it/s, v_num=o30c, train/loss=4.220" ] }, { @@ -4009,7 +4532,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 17%[==> ] 282.75M 3.54MB/s eta 8m 59s " + "Epoch 0: 2%| | 368/14932 [00:43<28:28, 8.52it/s, v_num=o30c, train/loss=4.220\r", + "Epoch 0: 2%| | 368/14932 [00:43<28:29, 8.52it/s, v_num=o30c, train/loss=4.120" ] }, { @@ -4017,7 +4541,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 17%[==> ] 284.44M 3.77MB/s eta 8m 59s " + "Epoch 0: 2%| | 369/14932 [00:43<28:27, 8.53it/s, v_num=o30c, train/loss=4.120\r", + "Epoch 0: 2%| | 369/14932 [00:43<28:27, 8.53it/s, v_num=o30c, train/loss=3.910" ] }, { @@ -4025,7 +4550,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 17%[==> ] 285.10M 3.68MB/s eta 8m 59s " + "Epoch 0: 2%| | 370/14932 [00:43<28:25, 8.54it/s, v_num=o30c, train/loss=3.910\r", + "Epoch 0: 2%| | 370/14932 [00:43<28:25, 8.54it/s, v_num=o30c, train/loss=3.920" ] }, { @@ -4033,7 +4559,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 17%[==> ] 285.43M 3.45MB/s eta 8m 57s " + "Epoch 0: 2%| | 371/14932 [00:43<28:23, 8.55it/s, v_num=o30c, train/loss=3.920\r", + "Epoch 0: 2%| | 371/14932 [00:43<28:23, 8.55it/s, v_num=o30c, train/loss=3.410" ] }, { @@ -4041,7 +4568,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 17%[==> ] 286.54M 3.47MB/s eta 8m 57s " + "Epoch 0: 2%| | 372/14932 [00:43<28:22, 8.55it/s, v_num=o30c, train/loss=3.410\r", + "Epoch 0: 2%| | 372/14932 [00:43<28:22, 8.55it/s, v_num=o30c, train/loss=2.610" ] }, { @@ -4049,7 +4577,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 17%[==> ] 287.19M 3.45MB/s eta 8m 57s " + "Epoch 0: 2%| | 373/14932 [00:43<28:20, 8.56it/s, v_num=o30c, train/loss=2.610\r", + "Epoch 0: 2%| | 373/14932 [00:43<28:20, 8.56it/s, v_num=o30c, train/loss=2.720" ] }, { @@ -4057,7 +4586,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 17%[==> ] 287.89M 3.41MB/s eta 8m 57s " + "Epoch 0: 3%| | 374/14932 [00:43<28:18, 8.57it/s, v_num=o30c, train/loss=2.720\r", + "Epoch 0: 3%| | 374/14932 [00:43<28:18, 8.57it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -4065,7 +4595,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 17%[==> ] 288.61M 3.21MB/s eta 8m 57s " + "Epoch 0: 3%| | 375/14932 [00:43<28:16, 8.58it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 3%| | 375/14932 [00:43<28:16, 8.58it/s, v_num=o30c, train/loss=2.810" ] }, { @@ -4073,7 +4604,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 17%[==> ] 289.35M 3.30MB/s eta 8m 53s " + "Epoch 0: 3%| | 376/14932 [00:43<28:14, 8.59it/s, v_num=o30c, train/loss=2.810\r", + "Epoch 0: 3%| | 376/14932 [00:43<28:14, 8.59it/s, v_num=o30c, train/loss=4.840" ] }, { @@ -4081,7 +4613,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 17%[==> ] 290.08M 3.24MB/s eta 8m 53s " + "Epoch 0: 3%| | 377/14932 [00:43<28:12, 8.60it/s, v_num=o30c, train/loss=4.840\r", + "Epoch 0: 3%| | 377/14932 [00:43<28:12, 8.60it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -4089,7 +4622,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 17%[==> ] 290.83M 3.27MB/s eta 8m 53s " + "Epoch 0: 3%| | 378/14932 [00:43<28:11, 8.61it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 3%| | 378/14932 [00:43<28:11, 8.61it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -4097,7 +4631,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 17%[==> ] 291.58M 3.21MB/s eta 8m 53s " + "Epoch 0: 3%| | 379/14932 [00:43<28:09, 8.61it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 3%| | 379/14932 [00:43<28:09, 8.61it/s, v_num=o30c, train/loss=3.220" ] }, { @@ -4105,7 +4640,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 17%[==> ] 292.35M 3.14MB/s eta 8m 53s " + "Epoch 0: 3%| | 380/14932 [00:44<28:07, 8.62it/s, v_num=o30c, train/loss=3.220\r", + "Epoch 0: 3%| | 380/14932 [00:44<28:07, 8.62it/s, v_num=o30c, train/loss=4.190" ] }, { @@ -4113,7 +4649,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 17%[==> ] 293.11M 3.16MB/s eta 8m 51s " + "Epoch 0: 3%| | 381/14932 [00:44<28:05, 8.63it/s, v_num=o30c, train/loss=4.190\r", + "Epoch 0: 3%| | 381/14932 [00:44<28:05, 8.63it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -4121,7 +4658,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 17%[==> ] 293.88M 3.09MB/s eta 8m 51s " + "Epoch 0: 3%| | 382/14932 [00:44<28:06, 8.63it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 3%| | 382/14932 [00:44<28:06, 8.63it/s, v_num=o30c, train/loss=2.950" ] }, { @@ -4129,7 +4667,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 17%[==> ] 294.66M 3.16MB/s eta 8m 51s " + "Epoch 0: 3%| | 383/14932 [00:45<28:43, 8.44it/s, v_num=o30c, train/loss=2.950\r", + "Epoch 0: 3%| | 383/14932 [00:45<28:43, 8.44it/s, v_num=o30c, train/loss=3.300" ] }, { @@ -4137,7 +4676,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 17%[==> ] 295.43M 3.05MB/s eta 8m 51s " + "Epoch 0: 3%| | 384/14932 [00:45<28:45, 8.43it/s, v_num=o30c, train/loss=3.300\r", + "Epoch 0: 3%| | 384/14932 [00:45<28:45, 8.43it/s, v_num=o30c, train/loss=3.450" ] }, { @@ -4145,7 +4685,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 18%[==> ] 296.21M 3.14MB/s eta 8m 51s " + "Epoch 0: 3%| | 385/14932 [00:45<28:43, 8.44it/s, v_num=o30c, train/loss=3.450\r", + "Epoch 0: 3%| | 385/14932 [00:45<28:43, 8.44it/s, v_num=o30c, train/loss=5.000" ] }, { @@ -4153,7 +4694,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 18%[==> ] 296.99M 3.34MB/s eta 8m 47s " + "Epoch 0: 3%| | 386/14932 [00:45<28:41, 8.45it/s, v_num=o30c, train/loss=5.000\r", + "Epoch 0: 3%| | 386/14932 [00:45<28:41, 8.45it/s, v_num=o30c, train/loss=3.270" ] }, { @@ -4161,7 +4703,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 18%[==> ] 297.77M 3.23MB/s eta 8m 47s " + "Epoch 0: 3%| | 387/14932 [00:45<28:39, 8.46it/s, v_num=o30c, train/loss=3.270\r", + "Epoch 0: 3%| | 387/14932 [00:45<28:39, 8.46it/s, v_num=o30c, train/loss=3.610" ] }, { @@ -4169,7 +4712,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 18%[==> ] 298.50M 3.22MB/s eta 8m 47s " + "Epoch 0: 3%| | 388/14932 [00:45<28:38, 8.47it/s, v_num=o30c, train/loss=3.610\r", + "Epoch 0: 3%| | 388/14932 [00:45<28:38, 8.47it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -4177,7 +4721,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 18%[==> ] 299.30M 3.30MB/s eta 8m 47s " + "Epoch 0: 3%| | 389/14932 [00:45<28:36, 8.47it/s, v_num=o30c, train/loss=2.920\r", + "Epoch 0: 3%| | 389/14932 [00:45<28:36, 8.47it/s, v_num=o30c, train/loss=3.590" ] }, { @@ -4185,7 +4730,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 18%[==> ] 300.08M 3.30MB/s eta 8m 47s " + "Epoch 0: 3%| | 390/14932 [00:46<28:43, 8.44it/s, v_num=o30c, train/loss=3.590\r", + "Epoch 0: 3%| | 390/14932 [00:46<28:43, 8.44it/s, v_num=o30c, train/loss=2.640" ] }, { @@ -4193,7 +4739,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 18%[==> ] 300.86M 3.27MB/s eta 8m 44s " + "Epoch 0: 3%| | 391/14932 [00:46<28:41, 8.45it/s, v_num=o30c, train/loss=2.640\r", + "Epoch 0: 3%| | 391/14932 [00:46<28:41, 8.45it/s, v_num=o30c, train/loss=1.280" ] }, { @@ -4201,7 +4748,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 18%[==> ] 301.66M 3.36MB/s eta 8m 44s " + "Epoch 0: 3%| | 392/14932 [00:46<28:39, 8.45it/s, v_num=o30c, train/loss=1.280\r", + "Epoch 0: 3%| | 392/14932 [00:46<28:39, 8.45it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -4209,7 +4757,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 18%[==> ] 302.32M 3.33MB/s eta 8m 44s " + "Epoch 0: 3%| | 393/14932 [00:46<28:37, 8.46it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 3%| | 393/14932 [00:46<28:37, 8.46it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -4217,7 +4766,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 18%[==> ] 303.11M 3.30MB/s eta 8m 44s " + "Epoch 0: 3%| | 394/14932 [00:46<28:35, 8.47it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 3%| | 394/14932 [00:46<28:36, 8.47it/s, v_num=o30c, train/loss=2.450" ] }, { @@ -4225,7 +4775,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 18%[==> ] 303.89M 3.27MB/s eta 8m 44s " + "Epoch 0: 3%| | 395/14932 [00:46<28:34, 8.48it/s, v_num=o30c, train/loss=2.450\r", + "Epoch 0: 3%| | 395/14932 [00:46<28:34, 8.48it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -4233,7 +4784,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 18%[==> ] 304.69M 3.36MB/s eta 8m 41s " + "Epoch 0: 3%| | 396/14932 [00:46<28:32, 8.49it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 3%| | 396/14932 [00:46<28:32, 8.49it/s, v_num=o30c, train/loss=3.550" ] }, { @@ -4241,7 +4793,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 18%[==> ] 305.47M 3.32MB/s eta 8m 41s " + "Epoch 0: 3%| | 397/14932 [00:46<28:30, 8.50it/s, v_num=o30c, train/loss=3.550\r", + "Epoch 0: 3%| | 397/14932 [00:46<28:30, 8.50it/s, v_num=o30c, train/loss=2.770" ] }, { @@ -4249,7 +4802,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 18%[==> ] 306.25M 3.32MB/s eta 8m 41s " + "Epoch 0: 3%| | 398/14932 [00:46<28:29, 8.50it/s, v_num=o30c, train/loss=2.770\r", + "Epoch 0: 3%| | 398/14932 [00:46<28:29, 8.50it/s, v_num=o30c, train/loss=4.250" ] }, { @@ -4257,7 +4811,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 18%[==> ] 307.05M 3.36MB/s eta 8m 41s " + "Epoch 0: 3%| | 399/14932 [00:46<28:27, 8.51it/s, v_num=o30c, train/loss=4.250\r", + "Epoch 0: 3%| | 399/14932 [00:46<28:27, 8.51it/s, v_num=o30c, train/loss=4.310" ] }, { @@ -4265,7 +4820,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 18%[==> ] 307.83M 3.33MB/s eta 8m 41s " + "Epoch 0: 3%| | 400/14932 [00:46<28:25, 8.52it/s, v_num=o30c, train/loss=4.310\r", + "Epoch 0: 3%| | 400/14932 [00:46<28:26, 8.52it/s, v_num=o30c, train/loss=3.910" ] }, { @@ -4273,7 +4829,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 18%[==> ] 308.61M 3.41MB/s eta 8m 38s " + "Epoch 0: 3%| | 401/14932 [00:47<28:24, 8.53it/s, v_num=o30c, train/loss=3.910\r", + "Epoch 0: 3%| | 401/14932 [00:47<28:24, 8.53it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -4281,7 +4838,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 18%[==> ] 309.41M 3.33MB/s eta 8m 38s " + "Epoch 0: 3%| | 402/14932 [00:47<28:22, 8.53it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 3%| | 402/14932 [00:47<28:22, 8.53it/s, v_num=o30c, train/loss=4.310" ] }, { @@ -4289,7 +4847,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 18%[==> ] 310.21M 3.40MB/s eta 8m 38s " + "Epoch 0: 3%| | 403/14932 [00:47<28:21, 8.54it/s, v_num=o30c, train/loss=4.310\r", + "Epoch 0: 3%| | 403/14932 [00:47<28:21, 8.54it/s, v_num=o30c, train/loss=2.950" ] }, { @@ -4297,7 +4856,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 18%[==> ] 310.99M 3.39MB/s eta 8m 38s " + "Epoch 0: 3%| | 404/14932 [00:47<28:19, 8.55it/s, v_num=o30c, train/loss=2.950\r", + "Epoch 0: 3%| | 404/14932 [00:47<28:19, 8.55it/s, v_num=o30c, train/loss=2.970" ] }, { @@ -4305,7 +4865,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 18%[==> ] 311.29M 3.01MB/s eta 8m 38s " + "Epoch 0: 3%| | 405/14932 [00:47<28:17, 8.56it/s, v_num=o30c, train/loss=2.970\r", + "Epoch 0: 3%| | 405/14932 [00:47<28:17, 8.56it/s, v_num=o30c, train/loss=3.170" ] }, { @@ -4313,7 +4874,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 19%[==> ] 313.05M 3.33MB/s eta 8m 38s " + "Epoch 0: 3%| | 406/14932 [00:47<28:15, 8.56it/s, v_num=o30c, train/loss=3.170\r", + "Epoch 0: 3%| | 406/14932 [00:47<28:15, 8.56it/s, v_num=o30c, train/loss=4.030" ] }, { @@ -4321,7 +4883,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 19%[==> ] 313.63M 3.26MB/s eta 8m 38s " + "Epoch 0: 3%| | 407/14932 [00:47<28:16, 8.56it/s, v_num=o30c, train/loss=4.030\r", + "Epoch 0: 3%| | 407/14932 [00:47<28:16, 8.56it/s, v_num=o30c, train/loss=4.660" ] }, { @@ -4329,7 +4892,8 @@ "output_type": "stream", "text": [ "\r", - "pth 19%[==> ] 314.24M 3.13MB/s eta 8m 38s " + "Epoch 0: 3%| | 408/14932 [00:47<28:14, 8.57it/s, v_num=o30c, train/loss=4.660\r", + "Epoch 0: 3%| | 408/14932 [00:47<28:14, 8.57it/s, v_num=o30c, train/loss=2.440" ] }, { @@ -4337,7 +4901,8 @@ "output_type": "stream", "text": [ "\r", - "th 19%[==> ] 314.85M 3.17MB/s eta 8m 38s " + "Epoch 0: 3%| | 409/14932 [00:47<28:12, 8.58it/s, v_num=o30c, train/loss=2.440\r", + "Epoch 0: 3%| | 409/14932 [00:47<28:12, 8.58it/s, v_num=o30c, train/loss=3.610" ] }, { @@ -4345,7 +4910,8 @@ "output_type": "stream", "text": [ "\r", - "h 19%[==> ] 315.49M 3.12MB/s eta 8m 34s " + "Epoch 0: 3%| | 410/14932 [00:47<28:11, 8.59it/s, v_num=o30c, train/loss=3.610\r", + "Epoch 0: 3%| | 410/14932 [00:47<28:11, 8.59it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -4353,7 +4919,8 @@ "output_type": "stream", "text": [ "\r", - " 19%[==> ] 316.14M 3.04MB/s eta 8m 34s " + "Epoch 0: 3%| | 411/14932 [00:47<28:09, 8.59it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 3%| | 411/14932 [00:47<28:09, 8.59it/s, v_num=o30c, train/loss=4.000" ] }, { @@ -4361,7 +4928,8 @@ "output_type": "stream", "text": [ "\r", - " v 19%[==> ] 316.82M 3.07MB/s eta 8m 34s " + "Epoch 0: 3%| | 412/14932 [00:47<28:07, 8.60it/s, v_num=o30c, train/loss=4.000\r", + "Epoch 0: 3%| | 412/14932 [00:47<28:07, 8.60it/s, v_num=o30c, train/loss=3.950" ] }, { @@ -4369,7 +4937,8 @@ "output_type": "stream", "text": [ "\r", - " v5 19%[==> ] 317.49M 3.04MB/s eta 8m 34s " + "Epoch 0: 3%| | 413/14932 [00:47<28:06, 8.61it/s, v_num=o30c, train/loss=3.950\r", + "Epoch 0: 3%| | 413/14932 [00:47<28:06, 8.61it/s, v_num=o30c, train/loss=2.480" ] }, { @@ -4377,7 +4946,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 19%[==> ] 318.19M 2.96MB/s eta 8m 34s " + "Epoch 0: 3%| | 414/14932 [00:48<28:04, 8.62it/s, v_num=o30c, train/loss=2.480\r", + "Epoch 0: 3%| | 414/14932 [00:48<28:04, 8.62it/s, v_num=o30c, train/loss=4.190" ] }, { @@ -4385,7 +4955,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 19%[==> ] 318.89M 3.00MB/s eta 8m 32s " + "Epoch 0: 3%| | 415/14932 [00:48<28:02, 8.63it/s, v_num=o30c, train/loss=4.190\r", + "Epoch 0: 3%| | 415/14932 [00:48<28:02, 8.63it/s, v_num=o30c, train/loss=3.970" ] }, { @@ -4393,7 +4964,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 19%[==> ] 319.61M 2.92MB/s eta 8m 32s " + "Epoch 0: 3%| | 416/14932 [00:48<28:05, 8.61it/s, v_num=o30c, train/loss=3.970\r", + "Epoch 0: 3%| | 416/14932 [00:48<28:05, 8.61it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -4401,7 +4973,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 19%[==> ] 320.35M 2.90MB/s eta 8m 32s " + "Epoch 0: 3%| | 417/14932 [00:48<28:03, 8.62it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 3%| | 417/14932 [00:48<28:03, 8.62it/s, v_num=o30c, train/loss=4.530" ] }, { @@ -4409,7 +4982,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 19%[==> ] 321.08M 2.91MB/s eta 8m 32s " + "Epoch 0: 3%| | 418/14932 [00:48<28:01, 8.63it/s, v_num=o30c, train/loss=4.530\r", + "Epoch 0: 3%| | 418/14932 [00:48<28:01, 8.63it/s, v_num=o30c, train/loss=3.810" ] }, { @@ -4417,7 +4991,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 19%[==> ] 321.82M 2.87MB/s eta 8m 32s " + "Epoch 0: 3%| | 419/14932 [00:48<28:00, 8.64it/s, v_num=o30c, train/loss=3.810\r", + "Epoch 0: 3%| | 419/14932 [00:48<28:00, 8.64it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -4425,7 +5000,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 19%[==> ] 322.55M 3.22MB/s eta 8m 30s " + "Epoch 0: 3%| | 420/14932 [00:48<27:58, 8.65it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 3%| | 420/14932 [00:48<27:58, 8.65it/s, v_num=o30c, train/loss=3.890" ] }, { @@ -4433,7 +5009,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 19%[==> ] 322.86M 2.87MB/s eta 8m 30s " + "Epoch 0: 3%| | 421/14932 [00:48<27:56, 8.65it/s, v_num=o30c, train/loss=3.890\r", + "Epoch 0: 3%| | 421/14932 [00:48<27:56, 8.65it/s, v_num=o30c, train/loss=3.380" ] }, { @@ -4441,7 +5018,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 19%[==> ] 323.57M 2.90MB/s eta 8m 30s " + "Epoch 0: 3%| | 422/14932 [00:48<27:55, 8.66it/s, v_num=o30c, train/loss=3.380\r", + "Epoch 0: 3%| | 422/14932 [00:48<27:55, 8.66it/s, v_num=o30c, train/loss=3.530" ] }, { @@ -4449,7 +5027,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 19%[==> ] 324.27M 2.94MB/s eta 8m 30s " + "Epoch 0: 3%| | 423/14932 [00:48<27:53, 8.67it/s, v_num=o30c, train/loss=3.530\r", + "Epoch 0: 3%| | 423/14932 [00:48<27:53, 8.67it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -4457,7 +5036,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 19%[==> ] 325.04M 3.03MB/s eta 8m 30s " + "Epoch 0: 3%| | 424/14932 [00:48<27:52, 8.68it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 3%| | 424/14932 [00:48<27:52, 8.68it/s, v_num=o30c, train/loss=2.250" ] }, { @@ -4465,7 +5045,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 19%[==> ] 325.82M 3.04MB/s eta 8m 28s " + "Epoch 0: 3%| | 425/14932 [00:48<27:50, 8.69it/s, v_num=o30c, train/loss=2.250\r", + "Epoch 0: 3%| | 425/14932 [00:48<27:50, 8.69it/s, v_num=o30c, train/loss=3.410" ] }, { @@ -4473,7 +5054,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 19%[==> ] 326.60M 3.08MB/s eta 8m 28s " + "Epoch 0: 3%| | 426/14932 [00:49<27:48, 8.69it/s, v_num=o30c, train/loss=3.410\r", + "Epoch 0: 3%| | 426/14932 [00:49<27:48, 8.69it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -4481,7 +5063,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 19%[==> ] 327.38M 3.15MB/s eta 8m 28s " + "Epoch 0: 3%| | 427/14932 [00:49<27:47, 8.70it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 3%| | 427/14932 [00:49<27:47, 8.70it/s, v_num=o30c, train/loss=1.480" ] }, { @@ -4489,7 +5072,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 19%[==> ] 328.16M 3.14MB/s eta 8m 28s " + "Epoch 0: 3%| | 428/14932 [00:49<27:45, 8.71it/s, v_num=o30c, train/loss=1.480\r", + "Epoch 0: 3%| | 428/14932 [00:49<27:45, 8.71it/s, v_num=o30c, train/loss=2.470" ] }, { @@ -4497,7 +5081,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 20%[===> ] 328.96M 3.18MB/s eta 8m 28s " + "Epoch 0: 3%| | 429/14932 [00:49<27:44, 8.71it/s, v_num=o30c, train/loss=2.470\r", + "Epoch 0: 3%| | 429/14932 [00:49<27:44, 8.71it/s, v_num=o30c, train/loss=3.830" ] }, { @@ -4505,7 +5090,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 20%[===> ] 329.74M 3.23MB/s eta 8m 25s " + "Epoch 0: 3%| | 430/14932 [00:49<27:42, 8.72it/s, v_num=o30c, train/loss=3.830\r", + "Epoch 0: 3%| | 430/14932 [00:49<27:42, 8.72it/s, v_num=o30c, train/loss=3.220" ] }, { @@ -4513,7 +5099,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 20%[===> ] 330.54M 3.21MB/s eta 8m 25s " + "Epoch 0: 3%| | 431/14932 [00:49<27:41, 8.73it/s, v_num=o30c, train/loss=3.220\r", + "Epoch 0: 3%| | 431/14932 [00:49<27:41, 8.73it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -4521,7 +5108,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 20%[===> ] 331.33M 3.28MB/s eta 8m 25s " + "Epoch 0: 3%| | 432/14932 [00:49<27:39, 8.74it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 3%| | 432/14932 [00:49<27:39, 8.74it/s, v_num=o30c, train/loss=3.340" ] }, { @@ -4529,7 +5117,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 20%[===> ] 332.13M 3.26MB/s eta 8m 25s " + "Epoch 0: 3%| | 433/14932 [00:49<27:38, 8.74it/s, v_num=o30c, train/loss=3.340\r", + "Epoch 0: 3%| | 433/14932 [00:49<27:38, 8.74it/s, v_num=o30c, train/loss=2.440" ] }, { @@ -4537,7 +5126,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 20%[===> ] 332.93M 3.33MB/s eta 8m 25s " + "Epoch 0: 3%| | 434/14932 [00:49<27:36, 8.75it/s, v_num=o30c, train/loss=2.440\r", + "Epoch 0: 3%| | 434/14932 [00:49<27:36, 8.75it/s, v_num=o30c, train/loss=3.050" ] }, { @@ -4545,7 +5135,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 20%[===> ] 333.71M 3.32MB/s eta 8m 22s " + "Epoch 0: 3%| | 435/14932 [00:49<27:35, 8.76it/s, v_num=o30c, train/loss=3.050\r", + "Epoch 0: 3%| | 435/14932 [00:49<27:35, 8.76it/s, v_num=o30c, train/loss=2.550" ] }, { @@ -4553,7 +5144,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 20%[===> ] 334.50M 3.30MB/s eta 8m 22s " + "Epoch 0: 3%| | 436/14932 [00:49<27:33, 8.77it/s, v_num=o30c, train/loss=2.550\r", + "Epoch 0: 3%| | 436/14932 [00:49<27:33, 8.76it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -4561,7 +5153,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 20%[===> ] 335.30M 3.35MB/s eta 8m 22s " + "Epoch 0: 3%| | 437/14932 [00:49<27:32, 8.77it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 3%| | 437/14932 [00:49<27:32, 8.77it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -4569,7 +5162,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 20%[===> ] 336.10M 3.29MB/s eta 8m 22s " + "Epoch 0: 3%| | 438/14932 [00:49<27:30, 8.78it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 3%| | 438/14932 [00:49<27:30, 8.78it/s, v_num=o30c, train/loss=2.590" ] }, { @@ -4577,7 +5171,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 20%[===> ] 336.72M 3.15MB/s eta 8m 21s " + "Epoch 0: 3%| | 439/14932 [00:49<27:29, 8.79it/s, v_num=o30c, train/loss=2.590\r", + "Epoch 0: 3%| | 439/14932 [00:49<27:29, 8.79it/s, v_num=o30c, train/loss=2.470" ] }, { @@ -4585,7 +5180,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 20%[===> ] 338.19M 3.34MB/s eta 8m 21s " + "Epoch 0: 3%| | 440/14932 [00:50<27:27, 8.79it/s, v_num=o30c, train/loss=2.470\r", + "Epoch 0: 3%| | 440/14932 [00:50<27:27, 8.79it/s, v_num=o30c, train/loss=2.660" ] }, { @@ -4593,7 +5189,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 20%[===> ] 338.69M 3.25MB/s eta 8m 21s " + "Epoch 0: 3%| | 441/14932 [00:50<27:26, 8.80it/s, v_num=o30c, train/loss=2.660\r", + "Epoch 0: 3%| | 441/14932 [00:50<27:26, 8.80it/s, v_num=o30c, train/loss=4.970" ] }, { @@ -4601,7 +5198,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 20%[===> ] 339.29M 3.21MB/s eta 8m 21s " + "Epoch 0: 3%| | 442/14932 [00:50<27:25, 8.81it/s, v_num=o30c, train/loss=4.970\r", + "Epoch 0: 3%| | 442/14932 [00:50<27:25, 8.81it/s, v_num=o30c, train/loss=3.550" ] }, { @@ -4609,7 +5207,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 20%[===> ] 339.89M 3.20MB/s eta 8m 21s " + "Epoch 0: 3%| | 443/14932 [00:50<27:23, 8.82it/s, v_num=o30c, train/loss=3.550\r", + "Epoch 0: 3%| | 443/14932 [00:50<27:23, 8.82it/s, v_num=o30c, train/loss=4.000" ] }, { @@ -4617,7 +5216,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 20%[===> ] 340.52M 3.12MB/s eta 8m 18s " + "Epoch 0: 3%| | 444/14932 [00:50<27:22, 8.82it/s, v_num=o30c, train/loss=4.000\r", + "Epoch 0: 3%| | 444/14932 [00:50<27:22, 8.82it/s, v_num=o30c, train/loss=2.950" ] }, { @@ -4625,7 +5225,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 20%[===> ] 341.18M 3.12MB/s eta 8m 18s " + "Epoch 0: 3%| | 445/14932 [00:50<27:20, 8.83it/s, v_num=o30c, train/loss=2.950\r", + "Epoch 0: 3%| | 445/14932 [00:50<27:20, 8.83it/s, v_num=o30c, train/loss=3.770" ] }, { @@ -4633,7 +5234,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 20%[===> ] 341.83M 3.08MB/s eta 8m 18s " + "Epoch 0: 3%| | 446/14932 [00:50<27:19, 8.84it/s, v_num=o30c, train/loss=3.770\r", + "Epoch 0: 3%| | 446/14932 [00:50<27:19, 8.84it/s, v_num=o30c, train/loss=4.220" ] }, { @@ -4641,7 +5243,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 20%[===> ] 342.52M 3.03MB/s eta 8m 18s " + "Epoch 0: 3%| | 447/14932 [00:50<27:17, 8.84it/s, v_num=o30c, train/loss=4.220\r", + "Epoch 0: 3%| | 447/14932 [00:50<27:17, 8.84it/s, v_num=o30c, train/loss=2.860" ] }, { @@ -4649,7 +5252,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 20%[===> ] 343.21M 3.03MB/s eta 8m 18s " + "Epoch 0: 3%| | 448/14932 [00:50<27:20, 8.83it/s, v_num=o30c, train/loss=2.860\r", + "Epoch 0: 3%| | 448/14932 [00:50<27:20, 8.83it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -4657,7 +5261,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 20%[===> ] 343.91M 2.97MB/s eta 8m 16s " + "Epoch 0: 3%| | 449/14932 [00:50<27:18, 8.84it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 3%| | 449/14932 [00:50<27:18, 8.84it/s, v_num=o30c, train/loss=2.750" ] }, { @@ -4665,7 +5270,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 20%[===> ] 344.63M 2.99MB/s eta 8m 16s " + "Epoch 0: 3%| | 450/14932 [00:50<27:17, 8.84it/s, v_num=o30c, train/loss=2.750\r", + "Epoch 0: 3%| | 450/14932 [00:50<27:17, 8.84it/s, v_num=o30c, train/loss=4.190" ] }, { @@ -4673,7 +5279,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 21%[===> ] 345.35M 2.96MB/s eta 8m 16s " + "Epoch 0: 3%| | 451/14932 [00:50<27:16, 8.85it/s, v_num=o30c, train/loss=4.190\r", + "Epoch 0: 3%| | 451/14932 [00:50<27:16, 8.85it/s, v_num=o30c, train/loss=2.720" ] }, { @@ -4681,7 +5288,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 21%[===> ] 346.08M 2.91MB/s eta 8m 16s " + "Epoch 0: 3%| | 452/14932 [00:51<27:14, 8.86it/s, v_num=o30c, train/loss=2.720\r", + "Epoch 0: 3%| | 452/14932 [00:51<27:14, 8.86it/s, v_num=o30c, train/loss=2.250" ] }, { @@ -4689,7 +5297,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 21%[===> ] 346.83M 2.93MB/s eta 8m 16s " + "Epoch 0: 3%| | 453/14932 [00:51<27:13, 8.86it/s, v_num=o30c, train/loss=2.250\r", + "Epoch 0: 3%| | 453/14932 [00:51<27:13, 8.86it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -4697,7 +5306,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 21%[===> ] 347.58M 2.87MB/s eta 8m 14s " + "Epoch 0: 3%| | 454/14932 [00:51<27:12, 8.87it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 3%| | 454/14932 [00:51<27:12, 8.87it/s, v_num=o30c, train/loss=2.730" ] }, { @@ -4705,7 +5315,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 21%[===> ] 348.33M 2.97MB/s eta 8m 14s " + "Epoch 0: 3%| | 455/14932 [00:51<27:10, 8.88it/s, v_num=o30c, train/loss=2.730\r", + "Epoch 0: 3%| | 455/14932 [00:51<27:10, 8.88it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -4713,7 +5324,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 21%[===> ] 349.10M 3.00MB/s eta 8m 14s " + "Epoch 0: 3%| | 456/14932 [00:51<27:09, 8.89it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 3%| | 456/14932 [00:51<27:09, 8.89it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -4721,7 +5333,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 21%[===> ] 349.88M 3.04MB/s eta 8m 14s " + "Epoch 0: 3%| | 457/14932 [00:51<27:07, 8.89it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 3%| | 457/14932 [00:51<27:07, 8.89it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -4729,7 +5342,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 21%[===> ] 350.64M 3.08MB/s eta 8m 14s " + "Epoch 0: 3%| | 458/14932 [00:51<27:06, 8.90it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 3%| | 458/14932 [00:51<27:06, 8.90it/s, v_num=o30c, train/loss=2.840" ] }, { @@ -4737,7 +5351,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 21%[===> ] 351.43M 3.09MB/s eta 8m 11s " + "Epoch 0: 3%| | 459/14932 [00:51<27:04, 8.91it/s, v_num=o30c, train/loss=2.840\r", + "Epoch 0: 3%| | 459/14932 [00:51<27:04, 8.91it/s, v_num=o30c, train/loss=3.950" ] }, { @@ -4745,7 +5360,8 @@ "output_type": "stream", "text": [ "\r", - "pth 21%[===> ] 352.21M 3.19MB/s eta 8m 11s " + "Epoch 0: 3%| | 460/14932 [00:51<27:03, 8.91it/s, v_num=o30c, train/loss=3.950\r", + "Epoch 0: 3%| | 460/14932 [00:51<27:03, 8.91it/s, v_num=o30c, train/loss=2.410" ] }, { @@ -4753,7 +5369,8 @@ "output_type": "stream", "text": [ "\r", - "th 21%[===> ] 352.99M 3.18MB/s eta 8m 11s " + "Epoch 0: 3%| | 461/14932 [00:51<27:01, 8.92it/s, v_num=o30c, train/loss=2.410\r", + "Epoch 0: 3%| | 461/14932 [00:51<27:01, 8.92it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -4761,7 +5378,8 @@ "output_type": "stream", "text": [ "\r", - "h 21%[===> ] 353.71M 3.20MB/s eta 8m 11s " + "Epoch 0: 3%| | 462/14932 [00:51<27:00, 8.93it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 3%| | 462/14932 [00:51<27:00, 8.93it/s, v_num=o30c, train/loss=2.550" ] }, { @@ -4769,7 +5387,8 @@ "output_type": "stream", "text": [ "\r", - " 21%[===> ] 354.10M 3.20MB/s eta 8m 11s " + "Epoch 0: 3%| | 463/14932 [00:51<26:59, 8.94it/s, v_num=o30c, train/loss=2.550\r", + "Epoch 0: 3%| | 463/14932 [00:51<26:59, 8.94it/s, v_num=o30c, train/loss=2.940" ] }, { @@ -4777,7 +5396,8 @@ "output_type": "stream", "text": [ "\r", - " v 21%[===> ] 354.82M 3.21MB/s eta 8m 9s " + "Epoch 0: 3%| | 464/14932 [00:51<26:57, 8.94it/s, v_num=o30c, train/loss=2.940\r", + "Epoch 0: 3%| | 464/14932 [00:51<26:57, 8.94it/s, v_num=o30c, train/loss=2.250" ] }, { @@ -4785,7 +5405,8 @@ "output_type": "stream", "text": [ "\r", - " v5 21%[===> ] 355.54M 3.22MB/s eta 8m 9s " + "Epoch 0: 3%| | 465/14932 [00:51<26:56, 8.95it/s, v_num=o30c, train/loss=2.250\r", + "Epoch 0: 3%| | 465/14932 [00:51<26:56, 8.95it/s, v_num=o30c, train/loss=2.270" ] }, { @@ -4793,7 +5414,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 21%[===> ] 356.25M 3.26MB/s eta 8m 9s " + "Epoch 0: 3%| | 466/14932 [00:52<26:54, 8.96it/s, v_num=o30c, train/loss=2.270\r", + "Epoch 0: 3%| | 466/14932 [00:52<26:54, 8.96it/s, v_num=o30c, train/loss=3.200" ] }, { @@ -4801,7 +5423,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 21%[===> ] 357.04M 3.26MB/s eta 8m 9s " + "Epoch 0: 3%| | 467/14932 [00:52<26:53, 8.96it/s, v_num=o30c, train/loss=3.200\r", + "Epoch 0: 3%| | 467/14932 [00:52<26:53, 8.96it/s, v_num=o30c, train/loss=3.620" ] }, { @@ -4809,7 +5432,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 21%[===> ] 357.83M 3.29MB/s eta 8m 9s " + "Epoch 0: 3%| | 468/14932 [00:52<26:52, 8.97it/s, v_num=o30c, train/loss=3.620\r", + "Epoch 0: 3%| | 468/14932 [00:52<26:52, 8.97it/s, v_num=o30c, train/loss=3.780" ] }, { @@ -4817,7 +5441,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 21%[===> ] 358.63M 3.33MB/s eta 8m 7s " + "Epoch 0: 3%| | 469/14932 [00:52<26:51, 8.98it/s, v_num=o30c, train/loss=3.780\r", + "Epoch 0: 3%| | 469/14932 [00:52<26:51, 8.98it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -4825,7 +5450,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 21%[===> ] 359.43M 3.28MB/s eta 8m 7s " + "Epoch 0: 3%| | 470/14932 [00:52<26:49, 8.98it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 3%| | 470/14932 [00:52<26:49, 8.98it/s, v_num=o30c, train/loss=2.810" ] }, { @@ -4833,7 +5459,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 21%[===> ] 360.22M 3.34MB/s eta 8m 7s " + "Epoch 0: 3%| | 471/14932 [00:52<26:48, 8.99it/s, v_num=o30c, train/loss=2.810\r", + "Epoch 0: 3%| | 471/14932 [00:52<26:48, 8.99it/s, v_num=o30c, train/loss=3.160" ] }, { @@ -4841,7 +5468,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 21%[===> ] 361.02M 3.34MB/s eta 8m 7s " + "Epoch 0: 3%| | 472/14932 [00:52<26:47, 9.00it/s, v_num=o30c, train/loss=3.160\r", + "Epoch 0: 3%| | 472/14932 [00:52<26:47, 9.00it/s, v_num=o30c, train/loss=4.440" ] }, { @@ -4849,7 +5477,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 22%[===> ] 361.82M 3.30MB/s eta 8m 7s " + "Epoch 0: 3%| | 473/14932 [00:52<26:46, 9.00it/s, v_num=o30c, train/loss=4.440\r", + "Epoch 0: 3%| | 473/14932 [00:52<26:46, 9.00it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -4857,7 +5486,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 22%[===> ] 362.60M 3.38MB/s eta 8m 4s " + "Epoch 0: 3%| | 474/14932 [00:52<26:44, 9.01it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 3%| | 474/14932 [00:52<26:44, 9.01it/s, v_num=o30c, train/loss=3.750" ] }, { @@ -4865,7 +5495,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 22%[===> ] 363.39M 3.38MB/s eta 8m 4s " + "Epoch 0: 3%| | 475/14932 [00:52<26:43, 9.02it/s, v_num=o30c, train/loss=3.750\r", + "Epoch 0: 3%| | 475/14932 [00:52<26:43, 9.02it/s, v_num=o30c, train/loss=2.060" ] }, { @@ -4873,7 +5504,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 22%[===> ] 364.19M 3.34MB/s eta 8m 4s " + "Epoch 0: 3%| | 476/14932 [00:52<26:42, 9.02it/s, v_num=o30c, train/loss=2.060\r", + "Epoch 0: 3%| | 476/14932 [00:52<26:42, 9.02it/s, v_num=o30c, train/loss=2.500" ] }, { @@ -4881,7 +5513,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 22%[===> ] 364.99M 3.39MB/s eta 8m 4s " + "Epoch 0: 3%| | 477/14932 [00:52<26:41, 9.03it/s, v_num=o30c, train/loss=2.500\r", + "Epoch 0: 3%| | 477/14932 [00:52<26:41, 9.03it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -4889,7 +5522,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 22%[===> ] 365.79M 3.39MB/s eta 8m 4s " + "Epoch 0: 3%| | 478/14932 [00:52<26:39, 9.03it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 3%| | 478/14932 [00:52<26:39, 9.03it/s, v_num=o30c, train/loss=1.810" ] }, { @@ -4897,7 +5531,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 22%[===> ] 366.58M 3.33MB/s eta 8m 1s " + "Epoch 0: 3%| | 479/14932 [00:52<26:38, 9.04it/s, v_num=o30c, train/loss=1.810\r", + "Epoch 0: 3%| | 479/14932 [00:52<26:38, 9.04it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -4905,7 +5540,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 22%[===> ] 367.38M 3.38MB/s eta 8m 1s " + "Epoch 0: 3%| | 480/14932 [00:53<26:40, 9.03it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -4913,7 +5548,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 22%[===> ] 368.18M 3.37MB/s eta 8m 1s " + "Epoch 0: 3%| | 480/14932 [00:53<26:40, 9.03it/s, v_num=o30c, train/loss=1.940" ] }, { @@ -4921,7 +5556,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 22%[===> ] 368.97M 3.41MB/s eta 8m 1s " + "Epoch 0: 3%| | 481/14932 [00:53<26:39, 9.04it/s, v_num=o30c, train/loss=1.940\r", + "Epoch 0: 3%| | 481/14932 [00:53<26:39, 9.04it/s, v_num=o30c, train/loss=4.250" ] }, { @@ -4929,7 +5565,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 22%[===> ] 369.79M 3.38MB/s eta 8m 1s " + "Epoch 0: 3%| | 482/14932 [00:53<26:37, 9.04it/s, v_num=o30c, train/loss=4.250\r", + "Epoch 0: 3%| | 482/14932 [00:53<26:37, 9.04it/s, v_num=o30c, train/loss=3.190" ] }, { @@ -4937,7 +5574,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 22%[===> ] 370.58M 3.35MB/s eta 7m 59s " + "Epoch 0: 3%| | 483/14932 [00:53<26:36, 9.05it/s, v_num=o30c, train/loss=3.190\r", + "Epoch 0: 3%| | 483/14932 [00:53<26:36, 9.05it/s, v_num=o30c, train/loss=3.440" ] }, { @@ -4945,7 +5583,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 22%[===> ] 371.39M 3.39MB/s eta 7m 59s " + "Epoch 0: 3%| | 484/14932 [00:53<26:35, 9.06it/s, v_num=o30c, train/loss=3.440\r", + "Epoch 0: 3%| | 484/14932 [00:53<26:35, 9.06it/s, v_num=o30c, train/loss=3.720" ] }, { @@ -4953,7 +5592,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 22%[===> ] 372.19M 3.41MB/s eta 7m 59s " + "Epoch 0: 3%| | 485/14932 [00:53<26:34, 9.06it/s, v_num=o30c, train/loss=3.720\r", + "Epoch 0: 3%| | 485/14932 [00:53<26:34, 9.06it/s, v_num=o30c, train/loss=2.560" ] }, { @@ -4961,7 +5601,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 22%[===> ] 373.00M 3.43MB/s eta 7m 59s " + "Epoch 0: 3%| | 486/14932 [00:53<26:36, 9.05it/s, v_num=o30c, train/loss=2.560\r", + "Epoch 0: 3%| | 486/14932 [00:53<26:36, 9.05it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -4969,7 +5610,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 22%[===> ] 373.83M 3.42MB/s eta 7m 59s " + "Epoch 0: 3%| | 487/14932 [00:53<26:37, 9.04it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 3%| | 487/14932 [00:53<26:37, 9.04it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -4977,7 +5619,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 22%[===> ] 374.64M 3.41MB/s eta 7m 56s " + "Epoch 0: 3%| | 488/14932 [00:53<26:35, 9.05it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 3%| | 488/14932 [00:53<26:35, 9.05it/s, v_num=o30c, train/loss=3.110" ] }, { @@ -4985,7 +5628,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 22%[===> ] 375.49M 3.44MB/s eta 7m 56s " + "Epoch 0: 3%| | 489/14932 [00:53<26:34, 9.06it/s, v_num=o30c, train/loss=3.110\r", + "Epoch 0: 3%| | 489/14932 [00:54<26:34, 9.06it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -4993,7 +5637,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 22%[===> ] 376.32M 3.45MB/s eta 7m 56s " + "Epoch 0: 3%| | 490/14932 [00:54<26:35, 9.05it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 3%| | 490/14932 [00:54<26:35, 9.05it/s, v_num=o30c, train/loss=4.410" ] }, { @@ -5001,7 +5646,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 22%[===> ] 377.16M 3.44MB/s eta 7m 56s " + "Epoch 0: 3%| | 491/14932 [00:54<26:34, 9.06it/s, v_num=o30c, train/loss=4.410\r", + "Epoch 0: 3%| | 491/14932 [00:54<26:34, 9.06it/s, v_num=o30c, train/loss=2.800" ] }, { @@ -5009,7 +5655,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 23%[===> ] 378.00M 3.50MB/s eta 7m 56s " + "Epoch 0: 3%| | 492/14932 [00:54<26:33, 9.06it/s, v_num=o30c, train/loss=2.800\r", + "Epoch 0: 3%| | 492/14932 [00:54<26:33, 9.06it/s, v_num=o30c, train/loss=3.830" ] }, { @@ -5017,7 +5664,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 23%[===> ] 378.88M 3.47MB/s eta 7m 53s " + "Epoch 0: 3%| | 493/14932 [00:54<26:32, 9.07it/s, v_num=o30c, train/loss=3.830\r", + "Epoch 0: 3%| | 493/14932 [00:54<26:32, 9.07it/s, v_num=o30c, train/loss=4.280" ] }, { @@ -5025,7 +5673,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 23%[===> ] 379.74M 3.47MB/s eta 7m 53s " + "Epoch 0: 3%| | 494/14932 [00:54<26:31, 9.07it/s, v_num=o30c, train/loss=4.280\r", + "Epoch 0: 3%| | 494/14932 [00:54<26:31, 9.07it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -5033,7 +5682,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 23%[===> ] 380.61M 3.54MB/s eta 7m 53s " + "Epoch 0: 3%| | 495/14932 [00:54<26:30, 9.08it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 3%| | 495/14932 [00:54<26:30, 9.08it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -5041,7 +5691,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 23%[===> ] 381.52M 3.53MB/s eta 7m 53s " + "Epoch 0: 3%| | 496/14932 [00:54<26:28, 9.09it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 3%| | 496/14932 [00:54<26:28, 9.09it/s, v_num=o30c, train/loss=3.120" ] }, { @@ -5049,7 +5700,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 23%[===> ] 382.43M 3.52MB/s eta 7m 53s " + "Epoch 0: 3%| | 497/14932 [00:54<26:27, 9.09it/s, v_num=o30c, train/loss=3.120\r", + "Epoch 0: 3%| | 497/14932 [00:54<26:27, 9.09it/s, v_num=o30c, train/loss=3.500" ] }, { @@ -5057,7 +5709,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 23%[===> ] 383.33M 3.61MB/s eta 7m 50s " + "Epoch 0: 3%| | 498/14932 [00:54<26:26, 9.10it/s, v_num=o30c, train/loss=3.500\r", + "Epoch 0: 3%| | 498/14932 [00:54<26:26, 9.10it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -5065,7 +5718,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 23%[===> ] 384.27M 3.61MB/s eta 7m 50s " + "Epoch 0: 3%| | 499/14932 [00:54<26:25, 9.10it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 3%| | 499/14932 [00:54<26:25, 9.10it/s, v_num=o30c, train/loss=3.750" ] }, { @@ -5073,7 +5727,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 23%[===> ] 385.11M 3.45MB/s eta 7m 50s " + "Epoch 0: 3%| | 500/14932 [00:54<26:24, 9.11it/s, v_num=o30c, train/loss=3.750\r", + "Epoch 0: 3%| | 500/14932 [00:54<26:24, 9.11it/s, v_num=o30c, train/loss=1.750" ] }, { @@ -5081,7 +5736,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 23%[===> ] 386.86M 3.65MB/s eta 7m 50s " + "Epoch 0: 3%| | 501/14932 [00:55<26:24, 9.11it/s, v_num=o30c, train/loss=1.750\r", + "Epoch 0: 3%| | 501/14932 [00:55<26:24, 9.11it/s, v_num=o30c, train/loss=4.530" ] }, { @@ -5089,7 +5745,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 23%[===> ] 387.57M 3.59MB/s eta 7m 47s " + "Epoch 0: 3%| | 502/14932 [00:55<26:23, 9.11it/s, v_num=o30c, train/loss=4.530\r", + "Epoch 0: 3%| | 502/14932 [00:55<26:23, 9.11it/s, v_num=o30c, train/loss=3.610" ] }, { @@ -5097,7 +5754,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 23%[===> ] 388.29M 3.64MB/s eta 7m 47s " + "Epoch 0: 3%| | 503/14932 [00:55<26:22, 9.12it/s, v_num=o30c, train/loss=3.610\r", + "Epoch 0: 3%| | 503/14932 [00:55<26:22, 9.12it/s, v_num=o30c, train/loss=3.520" ] }, { @@ -5105,7 +5763,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 23%[===> ] 389.04M 3.61MB/s eta 7m 47s " + "Epoch 0: 3%| | 504/14932 [00:55<26:21, 9.13it/s, v_num=o30c, train/loss=3.520\r", + "Epoch 0: 3%| | 504/14932 [00:55<26:21, 9.13it/s, v_num=o30c, train/loss=2.670" ] }, { @@ -5113,7 +5772,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 23%[===> ] 389.80M 3.55MB/s eta 7m 47s " + "Epoch 0: 3%| | 505/14932 [00:55<26:19, 9.13it/s, v_num=o30c, train/loss=2.670\r", + "Epoch 0: 3%| | 505/14932 [00:55<26:19, 9.13it/s, v_num=o30c, train/loss=1.460" ] }, { @@ -5121,7 +5781,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 23%[===> ] 390.58M 3.58MB/s eta 7m 47s " + "Epoch 0: 3%| | 506/14932 [00:55<26:18, 9.14it/s, v_num=o30c, train/loss=1.460\r", + "Epoch 0: 3%| | 506/14932 [00:55<26:18, 9.14it/s, v_num=o30c, train/loss=3.160" ] }, { @@ -5129,7 +5790,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 23%[===> ] 391.39M 3.55MB/s eta 7m 45s " + "Epoch 0: 3%| | 507/14932 [00:55<26:17, 9.14it/s, v_num=o30c, train/loss=3.160\r", + "Epoch 0: 3%| | 507/14932 [00:55<26:17, 9.14it/s, v_num=o30c, train/loss=3.590" ] }, { @@ -5137,7 +5799,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 23%[===> ] 392.22M 3.50MB/s eta 7m 45s " + "Epoch 0: 3%| | 508/14932 [00:55<26:16, 9.15it/s, v_num=o30c, train/loss=3.590\r", + "Epoch 0: 3%| | 508/14932 [00:55<26:16, 9.15it/s, v_num=o30c, train/loss=2.330" ] }, { @@ -5145,7 +5808,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 23%[===> ] 393.05M 3.55MB/s eta 7m 45s " + "Epoch 0: 3%| | 509/14932 [00:55<26:15, 9.16it/s, v_num=o30c, train/loss=2.330\r", + "Epoch 0: 3%| | 509/14932 [00:55<26:15, 9.16it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -5153,7 +5817,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 23%[===> ] 393.91M 3.51MB/s eta 7m 45s " + "Epoch 0: 3%| | 510/14932 [00:55<26:13, 9.16it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 3%| | 510/14932 [00:55<26:13, 9.16it/s, v_num=o30c, train/loss=2.270" ] }, { @@ -5161,7 +5826,8 @@ "output_type": "stream", "text": [ "\r", - "pth 24%[===> ] 394.77M 3.55MB/s eta 7m 45s " + "Epoch 0: 3%| | 511/14932 [00:55<26:12, 9.17it/s, v_num=o30c, train/loss=2.270\r", + "Epoch 0: 3%| | 511/14932 [00:55<26:12, 9.17it/s, v_num=o30c, train/loss=3.340" ] }, { @@ -5169,7 +5835,8 @@ "output_type": "stream", "text": [ "\r", - "th 24%[===> ] 395.64M 3.53MB/s eta 7m 42s " + "Epoch 0: 3%| | 512/14932 [00:55<26:15, 9.16it/s, v_num=o30c, train/loss=3.340\r", + "Epoch 0: 3%| | 512/14932 [00:55<26:15, 9.16it/s, v_num=o30c, train/loss=3.770" ] }, { @@ -5177,7 +5844,8 @@ "output_type": "stream", "text": [ "\r", - "h 24%[===> ] 396.54M 3.44MB/s eta 7m 42s " + "Epoch 0: 3%| | 513/14932 [00:55<26:13, 9.16it/s, v_num=o30c, train/loss=3.770\r", + "Epoch 0: 3%| | 513/14932 [00:55<26:13, 9.16it/s, v_num=o30c, train/loss=2.800" ] }, { @@ -5185,7 +5853,8 @@ "output_type": "stream", "text": [ "\r", - " 24%[===> ] 397.43M 3.52MB/s eta 7m 42s " + "Epoch 0: 3%| | 514/14932 [00:56<26:12, 9.17it/s, v_num=o30c, train/loss=2.800\r", + "Epoch 0: 3%| | 514/14932 [00:56<26:12, 9.17it/s, v_num=o30c, train/loss=3.840" ] }, { @@ -5193,7 +5862,8 @@ "output_type": "stream", "text": [ "\r", - " v 24%[===> ] 398.35M 3.43MB/s eta 7m 42s " + "Epoch 0: 3%| | 515/14932 [00:56<26:11, 9.17it/s, v_num=o30c, train/loss=3.840\r", + "Epoch 0: 3%| | 515/14932 [00:56<26:11, 9.17it/s, v_num=o30c, train/loss=3.520" ] }, { @@ -5201,7 +5871,8 @@ "output_type": "stream", "text": [ "\r", - " v5 24%[===> ] 399.25M 3.56MB/s eta 7m 42s " + "Epoch 0: 3%| | 516/14932 [00:56<26:10, 9.18it/s, v_num=o30c, train/loss=3.520\r", + "Epoch 0: 3%| | 516/14932 [00:56<26:10, 9.18it/s, v_num=o30c, train/loss=3.780" ] }, { @@ -5209,7 +5880,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 24%[===> ] 400.18M 3.60MB/s eta 7m 38s " + "Epoch 0: 3%| | 517/14932 [00:56<26:09, 9.19it/s, v_num=o30c, train/loss=3.780\r", + "Epoch 0: 3%| | 517/14932 [00:56<26:09, 9.19it/s, v_num=o30c, train/loss=1.800" ] }, { @@ -5217,7 +5889,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 24%[===> ] 401.11M 3.62MB/s eta 7m 38s " + "Epoch 0: 3%| | 518/14932 [00:56<26:08, 9.19it/s, v_num=o30c, train/loss=1.800\r", + "Epoch 0: 3%| | 518/14932 [00:56<26:08, 9.19it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -5225,7 +5898,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 24%[===> ] 402.05M 3.73MB/s eta 7m 38s " + "Epoch 0: 3%| | 519/14932 [00:56<26:06, 9.20it/s, v_num=o30c, train/loss=3.000\r", + "Epoch 0: 3%| | 519/14932 [00:56<26:06, 9.20it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -5233,7 +5907,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 24%[===> ] 402.99M 3.76MB/s eta 7m 38s " + "Epoch 0: 3%| | 520/14932 [00:56<26:05, 9.20it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 3%| | 520/14932 [00:56<26:05, 9.20it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -5241,7 +5916,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 24%[===> ] 403.94M 3.76MB/s eta 7m 38s " + "Epoch 0: 3%| | 521/14932 [00:56<26:04, 9.21it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 3%| | 521/14932 [00:56<26:04, 9.21it/s, v_num=o30c, train/loss=3.800" ] }, { @@ -5249,7 +5925,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 24%[===> ] 404.04M 3.50MB/s eta 7m 37s " + "Epoch 0: 3%| | 522/14932 [00:56<26:03, 9.22it/s, v_num=o30c, train/loss=3.800\r", + "Epoch 0: 3%| | 522/14932 [00:56<26:03, 9.22it/s, v_num=o30c, train/loss=1.240" ] }, { @@ -5257,7 +5934,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 24%[===> ] 405.74M 3.75MB/s eta 7m 37s " + "Epoch 0: 4%| | 523/14932 [00:56<26:02, 9.22it/s, v_num=o30c, train/loss=1.240\r", + "Epoch 0: 4%| | 523/14932 [00:56<26:02, 9.22it/s, v_num=o30c, train/loss=2.590" ] }, { @@ -5265,7 +5943,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 24%[===> ] 406.36M 3.68MB/s eta 7m 37s " + "Epoch 0: 4%| | 524/14932 [00:56<26:01, 9.23it/s, v_num=o30c, train/loss=2.590\r", + "Epoch 0: 4%| | 524/14932 [00:56<26:01, 9.23it/s, v_num=o30c, train/loss=3.160" ] }, { @@ -5273,7 +5952,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 24%[===> ] 406.99M 3.72MB/s eta 7m 37s " + "Epoch 0: 4%| | 525/14932 [00:57<26:05, 9.20it/s, v_num=o30c, train/loss=3.160\r", + "Epoch 0: 4%| | 525/14932 [00:57<26:05, 9.20it/s, v_num=o30c, train/loss=4.440" ] }, { @@ -5281,7 +5961,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 24%[===> ] 407.68M 3.65MB/s eta 7m 37s " + "Epoch 0: 4%| | 526/14932 [00:57<26:04, 9.21it/s, v_num=o30c, train/loss=4.440\r", + "Epoch 0: 4%| | 526/14932 [00:57<26:04, 9.21it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -5289,7 +5970,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 24%[===> ] 408.19M 3.35MB/s eta 7m 34s " + "Epoch 0: 4%| | 527/14932 [00:57<26:03, 9.22it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 4%| | 527/14932 [00:57<26:03, 9.22it/s, v_num=o30c, train/loss=4.000" ] }, { @@ -5297,7 +5979,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 24%[===> ] 409.57M 3.45MB/s eta 7m 34s " + "Epoch 0: 4%| | 528/14932 [00:57<26:01, 9.22it/s, v_num=o30c, train/loss=4.000\r", + "Epoch 0: 4%| | 528/14932 [00:57<26:01, 9.22it/s, v_num=o30c, train/loss=2.970" ] }, { @@ -5305,7 +5988,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 24%[===> ] 410.08M 3.31MB/s eta 7m 34s " + "Epoch 0: 4%| | 529/14932 [00:57<26:00, 9.23it/s, v_num=o30c, train/loss=2.970\r", + "Epoch 0: 4%| | 529/14932 [00:57<26:00, 9.23it/s, v_num=o30c, train/loss=2.780" ] }, { @@ -5313,7 +5997,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 25%[====> ] 410.61M 3.27MB/s eta 7m 34s " + "Epoch 0: 4%| | 530/14932 [00:57<25:59, 9.24it/s, v_num=o30c, train/loss=2.780\r", + "Epoch 0: 4%| | 530/14932 [00:57<25:59, 9.24it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -5321,7 +6006,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 25%[====> ] 411.16M 3.14MB/s eta 7m 34s " + "Epoch 0: 4%| | 531/14932 [00:57<25:58, 9.24it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 4%| | 531/14932 [00:57<25:58, 9.24it/s, v_num=o30c, train/loss=3.920" ] }, { @@ -5329,7 +6015,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 25%[====> ] 411.71M 3.12MB/s eta 7m 33s " + "Epoch 0: 4%| | 532/14932 [00:57<25:56, 9.25it/s, v_num=o30c, train/loss=3.920\r", + "Epoch 0: 4%| | 532/14932 [00:57<25:56, 9.25it/s, v_num=o30c, train/loss=3.890" ] }, { @@ -5337,7 +6024,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 25%[====> ] 412.27M 2.96MB/s eta 7m 33s " + "Epoch 0: 4%| | 533/14932 [00:57<25:55, 9.25it/s, v_num=o30c, train/loss=3.890\r", + "Epoch 0: 4%| | 533/14932 [00:57<25:55, 9.25it/s, v_num=o30c, train/loss=2.890" ] }, { @@ -5345,7 +6033,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 25%[====> ] 412.83M 2.85MB/s eta 7m 33s " + "Epoch 0: 4%| | 534/14932 [00:57<25:54, 9.26it/s, v_num=o30c, train/loss=2.890\r", + "Epoch 0: 4%| | 534/14932 [00:57<25:54, 9.26it/s, v_num=o30c, train/loss=3.830" ] }, { @@ -5353,7 +6042,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 25%[====> ] 413.41M 2.78MB/s eta 7m 33s " + "Epoch 0: 4%| | 535/14932 [00:57<25:53, 9.27it/s, v_num=o30c, train/loss=3.830\r", + "Epoch 0: 4%| | 535/14932 [00:57<25:53, 9.27it/s, v_num=o30c, train/loss=3.830" ] }, { @@ -5361,7 +6051,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 25%[====> ] 413.99M 2.67MB/s eta 7m 33s " + "Epoch 0: 4%| | 536/14932 [00:57<25:52, 9.27it/s, v_num=o30c, train/loss=3.830\r", + "Epoch 0: 4%| | 536/14932 [00:57<25:52, 9.27it/s, v_num=o30c, train/loss=1.140" ] }, { @@ -5369,7 +6060,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 25%[====> ] 414.58M 2.55MB/s eta 7m 32s " + "Epoch 0: 4%| | 537/14932 [00:57<25:51, 9.28it/s, v_num=o30c, train/loss=1.140\r", + "Epoch 0: 4%| | 537/14932 [00:57<25:51, 9.28it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -5377,7 +6069,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 25%[====> ] 415.18M 2.50MB/s eta 7m 32s " + "Epoch 0: 4%| | 538/14932 [00:57<25:50, 9.28it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 4%| | 538/14932 [00:57<25:50, 9.28it/s, v_num=o30c, train/loss=2.360" ] }, { @@ -5385,7 +6078,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 25%[====> ] 415.77M 2.53MB/s eta 7m 32s " + "Epoch 0: 4%| | 539/14932 [00:58<25:51, 9.28it/s, v_num=o30c, train/loss=2.360\r", + "Epoch 0: 4%| | 539/14932 [00:58<25:51, 9.28it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -5393,7 +6087,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 25%[====> ] 415.79M 2.29MB/s eta 7m 32s " + "Epoch 0: 4%| | 540/14932 [00:58<25:50, 9.28it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 4%| | 540/14932 [00:58<25:50, 9.28it/s, v_num=o30c, train/loss=2.660" ] }, { @@ -5401,7 +6096,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 25%[====> ] 416.86M 2.39MB/s eta 7m 32s " + "Epoch 0: 4%| | 541/14932 [00:58<25:49, 9.29it/s, v_num=o30c, train/loss=2.660\r", + "Epoch 0: 4%| | 541/14932 [00:58<25:49, 9.29it/s, v_num=o30c, train/loss=3.530" ] }, { @@ -5409,7 +6105,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 25%[====> ] 417.25M 2.39MB/s eta 7m 32s " + "Epoch 0: 4%| | 542/14932 [00:58<25:48, 9.30it/s, v_num=o30c, train/loss=3.530\r", + "Epoch 0: 4%| | 542/14932 [00:58<25:48, 9.30it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -5417,7 +6114,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 25%[====> ] 417.69M 2.27MB/s eta 7m 32s " + "Epoch 0: 4%| | 543/14932 [00:58<25:47, 9.30it/s, v_num=o30c, train/loss=3.000\r", + "Epoch 0: 4%| | 543/14932 [00:58<25:47, 9.30it/s, v_num=o30c, train/loss=2.470" ] }, { @@ -5425,7 +6123,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 25%[====> ] 418.13M 2.30MB/s eta 7m 32s " + "Epoch 0: 4%| | 544/14932 [00:58<25:49, 9.29it/s, v_num=o30c, train/loss=2.470\r", + "Epoch 0: 4%| | 544/14932 [00:58<25:49, 9.29it/s, v_num=o30c, train/loss=3.500" ] }, { @@ -5433,7 +6132,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 25%[====> ] 418.60M 2.27MB/s eta 7m 32s " + "Epoch 0: 4%| | 545/14932 [00:58<25:48, 9.29it/s, v_num=o30c, train/loss=3.500\r", + "Epoch 0: 4%| | 545/14932 [00:58<25:48, 9.29it/s, v_num=o30c, train/loss=3.160" ] }, { @@ -5441,7 +6141,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 25%[====> ] 419.05M 2.22MB/s eta 7m 32s " + "Epoch 0: 4%| | 546/14932 [00:58<25:47, 9.30it/s, v_num=o30c, train/loss=3.160\r", + "Epoch 0: 4%| | 546/14932 [00:58<25:47, 9.30it/s, v_num=o30c, train/loss=2.420" ] }, { @@ -5449,7 +6150,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 25%[====> ] 419.54M 2.24MB/s eta 7m 32s " + "Epoch 0: 4%| | 547/14932 [00:58<25:46, 9.30it/s, v_num=o30c, train/loss=2.420\r", + "Epoch 0: 4%| | 547/14932 [00:58<25:46, 9.30it/s, v_num=o30c, train/loss=2.750" ] }, { @@ -5457,7 +6159,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 25%[====> ] 420.02M 2.18MB/s eta 7m 32s " + "Epoch 0: 4%| | 548/14932 [00:58<25:45, 9.31it/s, v_num=o30c, train/loss=2.750\r", + "Epoch 0: 4%| | 548/14932 [00:58<25:45, 9.31it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -5465,7 +6168,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 25%[====> ] 420.50M 2.20MB/s eta 7m 32s " + "Epoch 0: 4%| | 549/14932 [00:58<25:44, 9.31it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 4%| | 549/14932 [00:58<25:44, 9.31it/s, v_num=o30c, train/loss=3.050" ] }, { @@ -5473,7 +6177,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 25%[====> ] 420.99M 2.14MB/s eta 7m 32s " + "Epoch 0: 4%| | 550/14932 [00:59<25:43, 9.32it/s, v_num=o30c, train/loss=3.050\r", + "Epoch 0: 4%| | 550/14932 [00:59<25:43, 9.32it/s, v_num=o30c, train/loss=3.280" ] }, { @@ -5481,7 +6186,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 25%[====> ] 421.47M 2.09MB/s eta 7m 32s " + "Epoch 0: 4%| | 551/14932 [00:59<25:43, 9.31it/s, v_num=o30c, train/loss=3.280\r", + "Epoch 0: 4%| | 551/14932 [00:59<25:43, 9.31it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -5489,7 +6195,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 25%[====> ] 421.96M 2.10MB/s eta 7m 32s " + "Epoch 0: 4%| | 552/14932 [00:59<25:42, 9.32it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 4%| | 552/14932 [00:59<25:42, 9.32it/s, v_num=o30c, train/loss=3.410" ] }, { @@ -5497,7 +6204,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 25%[====> ] 422.43M 2.07MB/s eta 7m 32s " + "Epoch 0: 4%| | 553/14932 [00:59<25:41, 9.33it/s, v_num=o30c, train/loss=3.410\r", + "Epoch 0: 4%| | 553/14932 [00:59<25:41, 9.33it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -5505,7 +6213,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 25%[====> ] 422.91M 2.26MB/s eta 7m 32s " + "Epoch 0: 4%| | 554/14932 [00:59<25:40, 9.33it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 4%| | 554/14932 [00:59<25:40, 9.33it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -5513,7 +6222,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 25%[====> ] 423.35M 2.04MB/s eta 7m 32s " + "Epoch 0: 4%| | 555/14932 [00:59<25:39, 9.34it/s, v_num=o30c, train/loss=2.920\r", + "Epoch 0: 4%| | 555/14932 [00:59<25:39, 9.34it/s, v_num=o30c, train/loss=3.110" ] }, { @@ -5521,7 +6231,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 25%[====> ] 423.66M 2.00MB/s eta 7m 32s " + "Epoch 0: 4%| | 556/14932 [00:59<25:39, 9.34it/s, v_num=o30c, train/loss=3.110\r", + "Epoch 0: 4%| | 556/14932 [00:59<25:39, 9.34it/s, v_num=o30c, train/loss=4.590" ] }, { @@ -5529,7 +6240,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 25%[====> ] 424.10M 2.04MB/s eta 7m 32s " + "Epoch 0: 4%| | 557/14932 [00:59<25:38, 9.35it/s, v_num=o30c, train/loss=4.590\r", + "Epoch 0: 4%| | 557/14932 [00:59<25:38, 9.35it/s, v_num=o30c, train/loss=4.410" ] }, { @@ -5537,7 +6249,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 25%[====> ] 424.61M 2.03MB/s eta 7m 32s " + "Epoch 0: 4%| | 558/14932 [00:59<25:37, 9.35it/s, v_num=o30c, train/loss=4.410\r", + "Epoch 0: 4%| | 558/14932 [00:59<25:37, 9.35it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -5545,7 +6258,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 25%[====> ] 425.11M 2.07MB/s eta 7m 32s " + "Epoch 0: 4%| | 559/14932 [00:59<25:36, 9.36it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 4%| | 559/14932 [00:59<25:36, 9.36it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -5553,7 +6267,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 25%[====> ] 425.63M 2.06MB/s eta 7m 32s " + "Epoch 0: 4%| | 560/14932 [00:59<25:35, 9.36it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 4%| | 560/14932 [00:59<25:35, 9.36it/s, v_num=o30c, train/loss=1.360" ] }, { @@ -5561,7 +6276,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 25%[====> ] 426.14M 2.10MB/s eta 7m 31s " + "Epoch 0: 4%| | 561/14932 [00:59<25:33, 9.37it/s, v_num=o30c, train/loss=1.360\r", + "Epoch 0: 4%| | 561/14932 [00:59<25:33, 9.37it/s, v_num=o30c, train/loss=1.990" ] }, { @@ -5569,7 +6285,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 25%[====> ] 426.66M 2.09MB/s eta 7m 31s " + "Epoch 0: 4%| | 562/14932 [00:59<25:32, 9.37it/s, v_num=o30c, train/loss=1.990\r", + "Epoch 0: 4%| | 562/14932 [00:59<25:32, 9.37it/s, v_num=o30c, train/loss=2.310" ] }, { @@ -5577,7 +6294,8 @@ "output_type": "stream", "text": [ "\r", - "pth 26%[====> ] 427.16M 2.12MB/s eta 7m 31s " + "Epoch 0: 4%| | 563/14932 [01:00<25:31, 9.38it/s, v_num=o30c, train/loss=2.310\r", + "Epoch 0: 4%| | 563/14932 [01:00<25:31, 9.38it/s, v_num=o30c, train/loss=2.330" ] }, { @@ -5585,7 +6303,8 @@ "output_type": "stream", "text": [ "\r", - "th 26%[====> ] 427.68M 2.10MB/s eta 7m 31s " + "Epoch 0: 4%| | 564/14932 [01:00<25:30, 9.39it/s, v_num=o30c, train/loss=2.330\r", + "Epoch 0: 4%| | 564/14932 [01:00<25:30, 9.39it/s, v_num=o30c, train/loss=3.560" ] }, { @@ -5593,7 +6312,8 @@ "output_type": "stream", "text": [ "\r", - "h 26%[====> ] 428.19M 2.13MB/s eta 7m 31s " + "Epoch 0: 4%| | 565/14932 [01:00<25:30, 9.39it/s, v_num=o30c, train/loss=3.560\r", + "Epoch 0: 4%| | 565/14932 [01:00<25:30, 9.39it/s, v_num=o30c, train/loss=2.620" ] }, { @@ -5601,7 +6321,8 @@ "output_type": "stream", "text": [ "\r", - " 26%[====> ] 428.71M 2.11MB/s eta 7m 31s " + "Epoch 0: 4%| | 566/14932 [01:00<25:29, 9.39it/s, v_num=o30c, train/loss=2.620\r", + "Epoch 0: 4%| | 566/14932 [01:00<25:29, 9.39it/s, v_num=o30c, train/loss=3.440" ] }, { @@ -5609,7 +6330,8 @@ "output_type": "stream", "text": [ "\r", - " v 26%[====> ] 429.22M 2.14MB/s eta 7m 31s " + "Epoch 0: 4%| | 567/14932 [01:00<25:28, 9.40it/s, v_num=o30c, train/loss=3.440\r", + "Epoch 0: 4%| | 567/14932 [01:00<25:28, 9.40it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -5617,7 +6339,8 @@ "output_type": "stream", "text": [ "\r", - " v5 26%[====> ] 429.72M 2.12MB/s eta 7m 31s " + "Epoch 0: 4%| | 568/14932 [01:00<25:30, 9.39it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 4%| | 568/14932 [01:00<25:30, 9.39it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -5625,7 +6348,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 26%[====> ] 430.24M 2.15MB/s eta 7m 31s " + "Epoch 0: 4%| | 569/14932 [01:00<25:29, 9.39it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 4%| | 569/14932 [01:00<25:29, 9.39it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -5633,7 +6357,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 26%[====> ] 430.75M 2.12MB/s eta 7m 31s " + "Epoch 0: 4%| | 570/14932 [01:00<25:28, 9.40it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 4%| | 570/14932 [01:00<25:28, 9.40it/s, v_num=o30c, train/loss=3.890" ] }, { @@ -5641,7 +6366,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 26%[====> ] 431.27M 2.15MB/s eta 7m 31s " + "Epoch 0: 4%| | 571/14932 [01:00<25:32, 9.37it/s, v_num=o30c, train/loss=3.890\r", + "Epoch 0: 4%| | 571/14932 [01:00<25:32, 9.37it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -5649,7 +6375,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 26%[====> ] 431.79M 2.13MB/s eta 7m 31s " + "Epoch 0: 4%| | 572/14932 [01:01<25:31, 9.37it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 4%| | 572/14932 [01:01<25:31, 9.37it/s, v_num=o30c, train/loss=1.440" ] }, { @@ -5657,7 +6384,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 26%[====> ] 432.30M 2.15MB/s eta 7m 31s " + "Epoch 0: 4%| | 573/14932 [01:01<25:30, 9.38it/s, v_num=o30c, train/loss=1.440\r", + "Epoch 0: 4%| | 573/14932 [01:01<25:30, 9.38it/s, v_num=o30c, train/loss=3.880" ] }, { @@ -5665,7 +6393,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 26%[====> ] 432.82M 2.13MB/s eta 7m 31s " + "Epoch 0: 4%| | 574/14932 [01:01<25:29, 9.38it/s, v_num=o30c, train/loss=3.880\r", + "Epoch 0: 4%| | 574/14932 [01:01<25:29, 9.38it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -5673,7 +6402,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 26%[====> ] 433.33M 2.15MB/s eta 7m 31s " + "Epoch 0: 4%| | 575/14932 [01:01<25:29, 9.39it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 4%| | 575/14932 [01:01<25:29, 9.39it/s, v_num=o30c, train/loss=4.720" ] }, { @@ -5681,7 +6411,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 26%[====> ] 433.85M 2.13MB/s eta 7m 30s " + "Epoch 0: 4%| | 576/14932 [01:01<25:30, 9.38it/s, v_num=o30c, train/loss=4.720\r", + "Epoch 0: 4%| | 576/14932 [01:01<25:30, 9.38it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -5689,7 +6420,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 26%[====> ] 434.38M 2.16MB/s eta 7m 30s " + "Epoch 0: 4%| | 577/14932 [01:01<25:29, 9.38it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 4%| | 577/14932 [01:01<25:29, 9.38it/s, v_num=o30c, train/loss=3.980" ] }, { @@ -5697,7 +6429,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 26%[====> ] 434.91M 2.14MB/s eta 7m 30s " + "Epoch 0: 4%| | 578/14932 [01:01<25:28, 9.39it/s, v_num=o30c, train/loss=3.980\r", + "Epoch 0: 4%| | 578/14932 [01:01<25:28, 9.39it/s, v_num=o30c, train/loss=2.800" ] }, { @@ -5705,7 +6438,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 26%[====> ] 435.44M 2.17MB/s eta 7m 30s " + "Epoch 0: 4%| | 579/14932 [01:01<25:29, 9.39it/s, v_num=o30c, train/loss=2.800\r", + "Epoch 0: 4%| | 579/14932 [01:01<25:29, 9.39it/s, v_num=o30c, train/loss=3.170" ] }, { @@ -5713,7 +6447,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 26%[====> ] 435.97M 2.16MB/s eta 7m 30s " + "Epoch 0: 4%| | 580/14932 [01:01<25:28, 9.39it/s, v_num=o30c, train/loss=3.170\r", + "Epoch 0: 4%| | 580/14932 [01:01<25:28, 9.39it/s, v_num=o30c, train/loss=5.000" ] }, { @@ -5721,7 +6456,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 26%[====> ] 436.50M 2.19MB/s eta 7m 30s " + "Epoch 0: 4%| | 581/14932 [01:01<25:27, 9.40it/s, v_num=o30c, train/loss=5.000\r", + "Epoch 0: 4%| | 581/14932 [01:01<25:27, 9.40it/s, v_num=o30c, train/loss=3.610" ] }, { @@ -5729,7 +6465,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 26%[====> ] 437.05M 2.17MB/s eta 7m 30s " + "Epoch 0: 4%| | 582/14932 [01:01<25:26, 9.40it/s, v_num=o30c, train/loss=3.610\r", + "Epoch 0: 4%| | 582/14932 [01:01<25:26, 9.40it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -5737,7 +6474,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 26%[====> ] 437.61M 2.21MB/s eta 7m 30s " + "Epoch 0: 4%| | 583/14932 [01:02<25:27, 9.40it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 4%| | 583/14932 [01:02<25:27, 9.40it/s, v_num=o30c, train/loss=3.050" ] }, { @@ -5745,7 +6483,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 26%[====> ] 438.16M 2.20MB/s eta 7m 30s " + "Epoch 0: 4%| | 584/14932 [01:02<25:26, 9.40it/s, v_num=o30c, train/loss=3.050\r", + "Epoch 0: 4%| | 584/14932 [01:02<25:26, 9.40it/s, v_num=o30c, train/loss=4.750" ] }, { @@ -5753,7 +6492,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 26%[====> ] 438.74M 2.24MB/s eta 7m 30s " + "Epoch 0: 4%| | 585/14932 [01:02<25:26, 9.40it/s, v_num=o30c, train/loss=4.750\r", + "Epoch 0: 4%| | 585/14932 [01:02<25:26, 9.40it/s, v_num=o30c, train/loss=3.610" ] }, { @@ -5761,7 +6501,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 26%[====> ] 439.30M 2.23MB/s eta 7m 29s " + "Epoch 0: 4%| | 586/14932 [01:02<25:27, 9.39it/s, v_num=o30c, train/loss=3.610\r", + "Epoch 0: 4%| | 586/14932 [01:02<25:27, 9.39it/s, v_num=o30c, train/loss=4.880" ] }, { @@ -5769,7 +6510,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 26%[====> ] 439.89M 2.28MB/s eta 7m 29s " + "Epoch 0: 4%| | 587/14932 [01:02<25:26, 9.40it/s, v_num=o30c, train/loss=4.880\r", + "Epoch 0: 4%| | 587/14932 [01:02<25:26, 9.40it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -5777,7 +6519,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 26%[====> ] 440.49M 2.28MB/s eta 7m 29s " + "Epoch 0: 4%| | 588/14932 [01:02<25:25, 9.40it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 4%| | 588/14932 [01:02<25:25, 9.40it/s, v_num=o30c, train/loss=4.250" ] }, { @@ -5785,7 +6528,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 26%[====> ] 441.10M 2.34MB/s eta 7m 29s " + "Epoch 0: 4%| | 589/14932 [01:02<25:24, 9.41it/s, v_num=o30c, train/loss=4.250\r", + "Epoch 0: 4%| | 589/14932 [01:02<25:24, 9.41it/s, v_num=o30c, train/loss=3.580" ] }, { @@ -5793,7 +6537,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 26%[====> ] 441.71M 2.34MB/s eta 7m 29s " + "Epoch 0: 4%| | 590/14932 [01:02<25:23, 9.41it/s, v_num=o30c, train/loss=3.580\r", + "Epoch 0: 4%| | 590/14932 [01:02<25:23, 9.41it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -5801,7 +6546,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 26%[====> ] 442.35M 2.40MB/s eta 7m 28s " + "Epoch 0: 4%| | 591/14932 [01:02<25:22, 9.42it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 4%| | 591/14932 [01:02<25:22, 9.42it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -5809,7 +6555,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 26%[====> ] 442.99M 2.40MB/s eta 7m 28s " + "Epoch 0: 4%| | 592/14932 [01:02<25:21, 9.42it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 4%| | 592/14932 [01:02<25:21, 9.42it/s, v_num=o30c, train/loss=4.160" ] }, { @@ -5817,7 +6564,7 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 27%[====> ] 443.66M 2.48MB/s eta 7m 28s " + "Epoch 0: 4%| | 593/14932 [01:02<25:20, 9.43it/s, v_num=o30c, train/loss=4.160" ] }, { @@ -5825,7 +6572,7 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 27%[====> ] 444.33M 2.49MB/s eta 7m 28s " + "Epoch 0: 4%| | 593/14932 [01:02<25:20, 9.43it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -5833,7 +6580,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 27%[====> ] 445.04M 2.57MB/s eta 7m 28s " + "Epoch 0: 4%| | 594/14932 [01:02<25:19, 9.43it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 4%| | 594/14932 [01:02<25:19, 9.43it/s, v_num=o30c, train/loss=4.380" ] }, { @@ -5841,7 +6589,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 27%[====> ] 445.75M 2.59MB/s eta 7m 27s " + "Epoch 0: 4%| | 595/14932 [01:03<25:18, 9.44it/s, v_num=o30c, train/loss=4.380\r", + "Epoch 0: 4%| | 595/14932 [01:03<25:18, 9.44it/s, v_num=o30c, train/loss=3.390" ] }, { @@ -5849,7 +6598,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 27%[====> ] 446.49M 2.72MB/s eta 7m 27s " + "Epoch 0: 4%| | 596/14932 [01:03<25:17, 9.44it/s, v_num=o30c, train/loss=3.390\r", + "Epoch 0: 4%| | 596/14932 [01:03<25:17, 9.44it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -5857,7 +6607,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 27%[====> ] 447.14M 2.73MB/s eta 7m 27s " + "Epoch 0: 4%| | 597/14932 [01:03<25:18, 9.44it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 4%| | 597/14932 [01:03<25:18, 9.44it/s, v_num=o30c, train/loss=4.060" ] }, { @@ -5865,7 +6616,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 27%[====> ] 447.88M 2.86MB/s eta 7m 27s " + "Epoch 0: 4%| | 598/14932 [01:03<25:17, 9.44it/s, v_num=o30c, train/loss=4.060\r", + "Epoch 0: 4%| | 598/14932 [01:03<25:17, 9.44it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -5873,7 +6625,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 27%[====> ] 448.66M 2.92MB/s eta 7m 27s " + "Epoch 0: 4%| | 599/14932 [01:03<25:16, 9.45it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 4%| | 599/14932 [01:03<25:16, 9.45it/s, v_num=o30c, train/loss=3.300" ] }, { @@ -5881,7 +6634,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 27%[====> ] 449.43M 2.95MB/s eta 7m 24s " + "Epoch 0: 4%| | 600/14932 [01:03<25:16, 9.45it/s, v_num=o30c, train/loss=3.300\r", + "Epoch 0: 4%| | 600/14932 [01:03<25:16, 9.45it/s, v_num=o30c, train/loss=2.700" ] }, { @@ -5889,7 +6643,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 27%[====> ] 450.21M 3.08MB/s eta 7m 24s " + "Epoch 0: 4%| | 601/14932 [01:03<25:15, 9.46it/s, v_num=o30c, train/loss=2.700\r", + "Epoch 0: 4%| | 601/14932 [01:03<25:15, 9.46it/s, v_num=o30c, train/loss=1.810" ] }, { @@ -5897,7 +6652,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 27%[====> ] 450.72M 3.03MB/s eta 7m 24s " + "Epoch 0: 4%| | 602/14932 [01:03<25:14, 9.46it/s, v_num=o30c, train/loss=1.810\r", + "Epoch 0: 4%| | 602/14932 [01:03<25:14, 9.46it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -5905,7 +6661,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 27%[====> ] 451.47M 3.14MB/s eta 7m 24s " + "Epoch 0: 4%| | 603/14932 [01:03<25:13, 9.47it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 4%| | 603/14932 [01:03<25:13, 9.47it/s, v_num=o30c, train/loss=3.940" ] }, { @@ -5913,7 +6670,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 27%[====> ] 452.39M 3.19MB/s eta 7m 24s " + "Epoch 0: 4%| | 604/14932 [01:03<25:12, 9.47it/s, v_num=o30c, train/loss=3.940\r", + "Epoch 0: 4%| | 604/14932 [01:03<25:12, 9.47it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -5921,7 +6679,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 27%[====> ] 453.35M 3.34MB/s eta 7m 22s " + "Epoch 0: 4%| | 605/14932 [01:03<25:11, 9.48it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 4%| | 605/14932 [01:03<25:11, 9.48it/s, v_num=o30c, train/loss=2.830" ] }, { @@ -5929,7 +6688,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 27%[====> ] 454.35M 3.39MB/s eta 7m 22s " + "Epoch 0: 4%| | 606/14932 [01:03<25:10, 9.48it/s, v_num=o30c, train/loss=2.830\r", + "Epoch 0: 4%| | 606/14932 [01:03<25:10, 9.48it/s, v_num=o30c, train/loss=3.050" ] }, { @@ -5937,7 +6697,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 27%[====> ] 455.38M 3.55MB/s eta 7m 22s " + "Epoch 0: 4%| | 607/14932 [01:03<25:09, 9.49it/s, v_num=o30c, train/loss=3.050\r", + "Epoch 0: 4%| | 607/14932 [01:03<25:09, 9.49it/s, v_num=o30c, train/loss=3.520" ] }, { @@ -5945,7 +6706,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 27%[====> ] 456.46M 3.62MB/s eta 7m 22s " + "Epoch 0: 4%| | 608/14932 [01:04<25:11, 9.48it/s, v_num=o30c, train/loss=3.520\r", + "Epoch 0: 4%| | 608/14932 [01:04<25:11, 9.48it/s, v_num=o30c, train/loss=4.160" ] }, { @@ -5953,7 +6715,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 27%[====> ] 457.57M 3.80MB/s eta 7m 22s " + "Epoch 0: 4%| | 609/14932 [01:04<25:12, 9.47it/s, v_num=o30c, train/loss=4.160\r", + "Epoch 0: 4%| | 609/14932 [01:04<25:12, 9.47it/s, v_num=o30c, train/loss=4.160" ] }, { @@ -5961,7 +6724,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 27%[====> ] 458.71M 3.89MB/s eta 7m 18s " + "Epoch 0: 4%| | 610/14932 [01:04<25:11, 9.48it/s, v_num=o30c, train/loss=4.160\r", + "Epoch 0: 4%| | 610/14932 [01:04<25:11, 9.48it/s, v_num=o30c, train/loss=3.340" ] }, { @@ -5969,7 +6733,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 28%[====> ] 459.79M 4.08MB/s eta 7m 18s " + "Epoch 0: 4%| | 611/14932 [01:04<25:10, 9.48it/s, v_num=o30c, train/loss=3.340\r", + "Epoch 0: 4%| | 611/14932 [01:04<25:10, 9.48it/s, v_num=o30c, train/loss=4.530" ] }, { @@ -5977,7 +6742,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 28%[====> ] 460.97M 4.19MB/s eta 7m 18s " + "Epoch 0: 4%| | 612/14932 [01:04<25:09, 9.49it/s, v_num=o30c, train/loss=4.530\r", + "Epoch 0: 4%| | 612/14932 [01:04<25:09, 9.49it/s, v_num=o30c, train/loss=2.940" ] }, { @@ -5985,7 +6751,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 28%[====> ] 462.18M 4.31MB/s eta 7m 18s " + "Epoch 0: 4%| | 613/14932 [01:04<25:08, 9.49it/s, v_num=o30c, train/loss=2.940\r", + "Epoch 0: 4%| | 613/14932 [01:04<25:08, 9.49it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -5993,7 +6760,8 @@ "output_type": "stream", "text": [ "\r", - "pth 28%[====> ] 463.38M 4.55MB/s eta 7m 18s " + "Epoch 0: 4%| | 614/14932 [01:04<25:07, 9.50it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 4%| | 614/14932 [01:04<25:07, 9.50it/s, v_num=o30c, train/loss=3.590" ] }, { @@ -6001,7 +6769,8 @@ "output_type": "stream", "text": [ "\r", - "th 28%[====> ] 464.72M 4.69MB/s eta 7m 13s " + "Epoch 0: 4%| | 615/14932 [01:04<25:07, 9.50it/s, v_num=o30c, train/loss=3.590\r", + "Epoch 0: 4%| | 615/14932 [01:04<25:07, 9.50it/s, v_num=o30c, train/loss=3.030" ] }, { @@ -6009,7 +6778,8 @@ "output_type": "stream", "text": [ "\r", - "h 28%[====> ] 465.94M 4.78MB/s eta 7m 13s " + "Epoch 0: 4%| | 616/14932 [01:04<25:06, 9.50it/s, v_num=o30c, train/loss=3.030\r", + "Epoch 0: 4%| | 616/14932 [01:04<25:06, 9.50it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -6017,7 +6787,8 @@ "output_type": "stream", "text": [ "\r", - " 28%[====> ] 466.93M 4.93MB/s eta 7m 13s " + "Epoch 0: 4%| | 617/14932 [01:04<25:05, 9.51it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 4%| | 617/14932 [01:04<25:05, 9.51it/s, v_num=o30c, train/loss=4.840" ] }, { @@ -6025,7 +6796,8 @@ "output_type": "stream", "text": [ "\r", - " v 28%[====> ] 468.14M 4.99MB/s eta 7m 13s " + "Epoch 0: 4%| | 618/14932 [01:04<25:04, 9.51it/s, v_num=o30c, train/loss=4.840\r", + "Epoch 0: 4%| | 618/14932 [01:04<25:04, 9.51it/s, v_num=o30c, train/loss=2.800" ] }, { @@ -6033,7 +6805,8 @@ "output_type": "stream", "text": [ "\r", - " v5 28%[====> ] 469.71M 5.26MB/s eta 7m 13s " + "Epoch 0: 4%| | 619/14932 [01:05<25:03, 9.52it/s, v_num=o30c, train/loss=2.800\r", + "Epoch 0: 4%| | 619/14932 [01:05<25:03, 9.52it/s, v_num=o30c, train/loss=3.810" ] }, { @@ -6041,7 +6814,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 28%[====> ] 471.33M 5.38MB/s eta 7m 7s " + "Epoch 0: 4%| | 620/14932 [01:05<25:03, 9.52it/s, v_num=o30c, train/loss=3.810\r", + "Epoch 0: 4%| | 620/14932 [01:05<25:03, 9.52it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -6049,7 +6823,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 28%[====> ] 473.02M 5.68MB/s eta 7m 7s " + "Epoch 0: 4%| | 621/14932 [01:05<25:02, 9.53it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 4%| | 621/14932 [01:05<25:02, 9.53it/s, v_num=o30c, train/loss=3.450" ] }, { @@ -6057,7 +6832,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 28%[====> ] 474.77M 5.84MB/s eta 7m 7s " + "Epoch 0: 4%| | 622/14932 [01:05<25:01, 9.53it/s, v_num=o30c, train/loss=3.450\r", + "Epoch 0: 4%| | 622/14932 [01:05<25:01, 9.53it/s, v_num=o30c, train/loss=2.950" ] }, { @@ -6065,7 +6841,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 29%[====> ] 476.38M 6.12MB/s eta 7m 7s " + "Epoch 0: 4%| | 623/14932 [01:05<25:00, 9.54it/s, v_num=o30c, train/loss=2.950\r", + "Epoch 0: 4%| | 623/14932 [01:05<25:00, 9.54it/s, v_num=o30c, train/loss=3.030" ] }, { @@ -6073,7 +6850,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 29%[====> ] 478.21M 6.31MB/s eta 7m 7s " + "Epoch 0: 4%| | 624/14932 [01:05<24:59, 9.54it/s, v_num=o30c, train/loss=3.030\r", + "Epoch 0: 4%| | 624/14932 [01:05<24:59, 9.54it/s, v_num=o30c, train/loss=3.390" ] }, { @@ -6081,7 +6859,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 29%[====> ] 480.04M 6.66MB/s eta 6m 59s " + "Epoch 0: 4%| | 625/14932 [01:05<24:59, 9.54it/s, v_num=o30c, train/loss=3.390\r", + "Epoch 0: 4%| | 625/14932 [01:05<24:59, 9.54it/s, v_num=o30c, train/loss=3.530" ] }, { @@ -6089,7 +6868,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 29%[====> ] 481.88M 6.84MB/s eta 6m 59s " + "Epoch 0: 4%| | 626/14932 [01:05<24:58, 9.55it/s, v_num=o30c, train/loss=3.530\r", + "Epoch 0: 4%| | 626/14932 [01:05<24:58, 9.55it/s, v_num=o30c, train/loss=3.520" ] }, { @@ -6097,7 +6877,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 29%[====> ] 483.93M 7.04MB/s eta 6m 59s " + "Epoch 0: 4%| | 627/14932 [01:05<24:57, 9.55it/s, v_num=o30c, train/loss=3.520\r", + "Epoch 0: 4%| | 627/14932 [01:05<24:57, 9.55it/s, v_num=o30c, train/loss=1.770" ] }, { @@ -6105,7 +6886,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 29%[====> ] 485.64M 7.36MB/s eta 6m 59s " + "Epoch 0: 4%| | 628/14932 [01:05<24:56, 9.56it/s, v_num=o30c, train/loss=1.770\r", + "Epoch 0: 4%| | 628/14932 [01:05<24:56, 9.56it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -6113,7 +6895,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 29%[====> ] 487.11M 7.37MB/s eta 6m 59s " + "Epoch 0: 4%| | 629/14932 [01:05<24:55, 9.56it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 4%| | 629/14932 [01:05<24:55, 9.56it/s, v_num=o30c, train/loss=3.840" ] }, { @@ -6121,7 +6904,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 29%[====> ] 489.08M 7.71MB/s eta 6m 50s " + "Epoch 0: 4%| | 630/14932 [01:05<24:54, 9.57it/s, v_num=o30c, train/loss=3.840\r", + "Epoch 0: 4%| | 630/14932 [01:05<24:54, 9.57it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -6129,7 +6913,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 29%[====> ] 491.21M 7.89MB/s eta 6m 50s " + "Epoch 0: 4%| | 631/14932 [01:05<24:54, 9.57it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 4%| | 631/14932 [01:05<24:54, 9.57it/s, v_num=o30c, train/loss=3.170" ] }, { @@ -6137,7 +6922,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 30%[=====> ] 493.55M 8.33MB/s eta 6m 50s " + "Epoch 0: 4%| | 632/14932 [01:06<24:53, 9.58it/s, v_num=o30c, train/loss=3.170\r", + "Epoch 0: 4%| | 632/14932 [01:06<24:53, 9.58it/s, v_num=o30c, train/loss=3.730" ] }, { @@ -6145,7 +6931,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 30%[=====> ] 495.94M 8.57MB/s eta 6m 50s " + "Epoch 0: 4%| | 633/14932 [01:06<24:52, 9.58it/s, v_num=o30c, train/loss=3.730\r", + "Epoch 0: 4%| | 633/14932 [01:06<24:52, 9.58it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -6153,7 +6940,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 30%[=====> ] 498.32M 8.77MB/s eta 6m 50s " + "Epoch 0: 4%| | 634/14932 [01:06<24:51, 9.58it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 4%| | 634/14932 [01:06<24:51, 9.58it/s, v_num=o30c, train/loss=3.880" ] }, { @@ -6161,7 +6949,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 30%[=====> ] 500.94M 9.30MB/s eta 6m 39s " + "Epoch 0: 4%| | 635/14932 [01:06<24:51, 9.59it/s, v_num=o30c, train/loss=3.880\r", + "Epoch 0: 4%| | 635/14932 [01:06<24:51, 9.59it/s, v_num=o30c, train/loss=3.700" ] }, { @@ -6169,7 +6958,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 30%[=====> ] 502.96M 9.34MB/s eta 6m 39s " + "Epoch 0: 4%| | 636/14932 [01:06<24:50, 9.59it/s, v_num=o30c, train/loss=3.700\r", + "Epoch 0: 4%| | 636/14932 [01:06<24:50, 9.59it/s, v_num=o30c, train/loss=2.720" ] }, { @@ -6177,7 +6967,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 30%[=====> ] 504.88M 9.61MB/s eta 6m 39s " + "Epoch 0: 4%| | 637/14932 [01:06<24:49, 9.60it/s, v_num=o30c, train/loss=2.720\r", + "Epoch 0: 4%| | 637/14932 [01:06<24:49, 9.60it/s, v_num=o30c, train/loss=3.190" ] }, { @@ -6185,7 +6976,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 30%[=====> ] 507.41M 9.80MB/s eta 6m 39s " + "Epoch 0: 4%| | 638/14932 [01:06<24:48, 9.60it/s, v_num=o30c, train/loss=3.190\r", + "Epoch 0: 4%| | 638/14932 [01:06<24:48, 9.60it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -6193,7 +6985,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 31%[=====> ] 510.45M 10.3MB/s eta 6m 39s " + "Epoch 0: 4%| | 639/14932 [01:06<24:48, 9.60it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 4%| | 639/14932 [01:06<24:48, 9.60it/s, v_num=o30c, train/loss=3.660" ] }, { @@ -6201,7 +6994,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 31%[=====> ] 512.55M 10.4MB/s eta 6m 28s " + "Epoch 0: 4%| | 640/14932 [01:06<24:50, 9.59it/s, v_num=o30c, train/loss=3.660\r", + "Epoch 0: 4%| | 640/14932 [01:06<24:50, 9.59it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -6209,7 +7003,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 31%[=====> ] 515.80M 10.7MB/s eta 6m 28s " + "Epoch 0: 4%| | 641/14932 [01:06<24:50, 9.59it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 4%| | 641/14932 [01:06<24:50, 9.59it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -6217,7 +7012,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 31%[=====> ] 519.18M 11.2MB/s eta 6m 28s " + "Epoch 0: 4%| | 642/14932 [01:06<24:49, 9.59it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 4%| | 642/14932 [01:06<24:49, 9.59it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -6225,7 +7021,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 31%[=====> ] 522.54M 11.4MB/s eta 6m 28s " + "Epoch 0: 4%| | 643/14932 [01:07<24:49, 9.60it/s, v_num=o30c, train/loss=3.000\r", + "Epoch 0: 4%| | 643/14932 [01:07<24:49, 9.60it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -6233,7 +7030,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 32%[=====> ] 526.02M 12.0MB/s eta 6m 28s " + "Epoch 0: 4%| | 644/14932 [01:07<24:48, 9.60it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 4%| | 644/14932 [01:07<24:48, 9.60it/s, v_num=o30c, train/loss=4.440" ] }, { @@ -6241,7 +7039,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 32%[=====> ] 529.60M 12.1MB/s eta 6m 12s " + "Epoch 0: 4%| | 645/14932 [01:07<24:47, 9.61it/s, v_num=o30c, train/loss=4.440\r", + "Epoch 0: 4%| | 645/14932 [01:07<24:47, 9.61it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -6249,7 +7048,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 32%[=====> ] 533.19M 12.5MB/s eta 6m 12s " + "Epoch 0: 4%| | 646/14932 [01:07<24:46, 9.61it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 4%| | 646/14932 [01:07<24:46, 9.61it/s, v_num=o30c, train/loss=2.800" ] }, { @@ -6257,7 +7057,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 32%[=====> ] 536.99M 12.7MB/s eta 6m 12s " + "Epoch 0: 4%| | 647/14932 [01:07<24:45, 9.62it/s, v_num=o30c, train/loss=2.800\r", + "Epoch 0: 4%| | 647/14932 [01:07<24:45, 9.61it/s, v_num=o30c, train/loss=4.250" ] }, { @@ -6265,7 +7066,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 32%[=====> ] 538.91M 11.7MB/s eta 6m 12s " + "Epoch 0: 4%| | 648/14932 [01:07<24:44, 9.62it/s, v_num=o30c, train/loss=4.250\r", + "Epoch 0: 4%| | 648/14932 [01:07<24:44, 9.62it/s, v_num=o30c, train/loss=2.700" ] }, { @@ -6273,7 +7075,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 33%[=====> ] 542.74M 12.1MB/s eta 6m 2s " + "Epoch 0: 4%| | 649/14932 [01:07<24:43, 9.63it/s, v_num=o30c, train/loss=2.700\r", + "Epoch 0: 4%| | 649/14932 [01:07<24:43, 9.63it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -6281,7 +7084,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 33%[=====> ] 545.64M 12.0MB/s eta 6m 2s " + "Epoch 0: 4%| | 650/14932 [01:07<24:43, 9.63it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 4%| | 650/14932 [01:07<24:43, 9.63it/s, v_num=o30c, train/loss=3.410" ] }, { @@ -6289,7 +7093,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 33%[=====> ] 548.72M 12.2MB/s eta 6m 2s " + "Epoch 0: 4%| | 651/14932 [01:07<24:42, 9.63it/s, v_num=o30c, train/loss=3.410\r", + "Epoch 0: 4%| | 651/14932 [01:07<24:42, 9.63it/s, v_num=o30c, train/loss=3.950" ] }, { @@ -6297,7 +7102,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 33%[=====> ] 551.86M 12.3MB/s eta 6m 2s " + "Epoch 0: 4%| | 652/14932 [01:07<24:41, 9.64it/s, v_num=o30c, train/loss=3.950\r", + "Epoch 0: 4%| | 652/14932 [01:07<24:41, 9.64it/s, v_num=o30c, train/loss=2.550" ] }, { @@ -6305,7 +7111,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 33%[=====> ] 555.10M 12.3MB/s eta 6m 2s " + "Epoch 0: 4%| | 653/14932 [01:07<24:40, 9.64it/s, v_num=o30c, train/loss=2.550\r", + "Epoch 0: 4%| | 653/14932 [01:07<24:40, 9.64it/s, v_num=o30c, train/loss=2.700" ] }, { @@ -6313,7 +7120,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 34%[=====> ] 558.36M 12.6MB/s eta 5m 49s " + "Epoch 0: 4%| | 654/14932 [01:07<24:40, 9.65it/s, v_num=o30c, train/loss=2.700\r", + "Epoch 0: 4%| | 654/14932 [01:07<24:40, 9.65it/s, v_num=o30c, train/loss=3.340" ] }, { @@ -6321,7 +7129,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 34%[=====> ] 561.68M 12.6MB/s eta 5m 49s " + "Epoch 0: 4%| | 655/14932 [01:07<24:39, 9.65it/s, v_num=o30c, train/loss=3.340\r", + "Epoch 0: 4%| | 655/14932 [01:07<24:39, 9.65it/s, v_num=o30c, train/loss=3.340" ] }, { @@ -6329,7 +7138,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 34%[=====> ] 563.79M 11.9MB/s eta 5m 49s " + "Epoch 0: 4%| | 656/14932 [01:07<24:38, 9.65it/s, v_num=o30c, train/loss=3.340\r", + "Epoch 0: 4%| | 656/14932 [01:07<24:38, 9.65it/s, v_num=o30c, train/loss=4.030" ] }, { @@ -6337,7 +7147,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 34%[=====> ] 567.58M 12.0MB/s eta 5m 49s " + "Epoch 0: 4%| | 657/14932 [01:08<24:37, 9.66it/s, v_num=o30c, train/loss=4.030\r", + "Epoch 0: 4%| | 657/14932 [01:08<24:37, 9.66it/s, v_num=o30c, train/loss=3.720" ] }, { @@ -6345,7 +7156,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 34%[=====> ] 570.00M 11.7MB/s eta 5m 40s " + "Epoch 0: 4%| | 658/14932 [01:08<24:37, 9.66it/s, v_num=o30c, train/loss=3.720\r", + "Epoch 0: 4%| | 658/14932 [01:08<24:37, 9.66it/s, v_num=o30c, train/loss=4.060" ] }, { @@ -6353,7 +7165,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 34%[=====> ] 572.47M 11.9MB/s eta 5m 40s " + "Epoch 0: 4%| | 659/14932 [01:08<24:36, 9.67it/s, v_num=o30c, train/loss=4.060\r", + "Epoch 0: 4%| | 659/14932 [01:08<24:36, 9.67it/s, v_num=o30c, train/loss=3.940" ] }, { @@ -6361,7 +7174,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 35%[======> ] 574.99M 11.6MB/s eta 5m 40s " + "Epoch 0: 4%| | 660/14932 [01:08<24:35, 9.67it/s, v_num=o30c, train/loss=3.940\r", + "Epoch 0: 4%| | 660/14932 [01:08<24:35, 9.67it/s, v_num=o30c, train/loss=2.270" ] }, { @@ -6369,7 +7183,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 35%[======> ] 577.54M 11.3MB/s eta 5m 40s " + "Epoch 0: 4%| | 661/14932 [01:08<24:34, 9.68it/s, v_num=o30c, train/loss=2.270\r", + "Epoch 0: 4%| | 661/14932 [01:08<24:34, 9.68it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -6377,7 +7192,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 35%[======> ] 580.10M 11.5MB/s eta 5m 40s " + "Epoch 0: 4%| | 662/14932 [01:08<24:33, 9.68it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 4%| | 662/14932 [01:08<24:33, 9.68it/s, v_num=o30c, train/loss=2.110" ] }, { @@ -6385,7 +7201,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 35%[======> ] 582.69M 11.2MB/s eta 5m 31s " + "Epoch 0: 4%| | 663/14932 [01:08<24:33, 9.69it/s, v_num=o30c, train/loss=2.110\r", + "Epoch 0: 4%| | 663/14932 [01:08<24:33, 9.69it/s, v_num=o30c, train/loss=2.950" ] }, { @@ -6393,7 +7210,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 35%[======> ] 585.30M 10.9MB/s eta 5m 31s " + "Epoch 0: 4%| | 664/14932 [01:08<24:32, 9.69it/s, v_num=o30c, train/loss=2.950\r", + "Epoch 0: 4%| | 664/14932 [01:08<24:32, 9.69it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -6401,7 +7219,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 35%[======> ] 587.05M 10.9MB/s eta 5m 31s " + "Epoch 0: 4%| | 665/14932 [01:08<24:31, 9.69it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 4%| | 665/14932 [01:08<24:31, 9.69it/s, v_num=o30c, train/loss=2.330" ] }, { @@ -6409,7 +7228,8 @@ "output_type": "stream", "text": [ "\r", - "pth 35%[======> ] 590.71M 10.8MB/s eta 5m 31s " + "Epoch 0: 4%| | 666/14932 [01:08<24:30, 9.70it/s, v_num=o30c, train/loss=2.330\r", + "Epoch 0: 4%| | 666/14932 [01:08<24:30, 9.70it/s, v_num=o30c, train/loss=3.700" ] }, { @@ -6417,7 +7237,8 @@ "output_type": "stream", "text": [ "\r", - "th 36%[======> ] 592.58M 10.5MB/s eta 5m 24s " + "Epoch 0: 4%| | 667/14932 [01:08<24:30, 9.70it/s, v_num=o30c, train/loss=3.700\r", + "Epoch 0: 4%| | 667/14932 [01:08<24:30, 9.70it/s, v_num=o30c, train/loss=3.770" ] }, { @@ -6425,7 +7246,8 @@ "output_type": "stream", "text": [ "\r", - "h 36%[======> ] 594.52M 11.0MB/s eta 5m 24s " + "Epoch 0: 4%| | 668/14932 [01:08<24:29, 9.71it/s, v_num=o30c, train/loss=3.770\r", + "Epoch 0: 4%| | 668/14932 [01:08<24:29, 9.71it/s, v_num=o30c, train/loss=2.940" ] }, { @@ -6433,7 +7255,8 @@ "output_type": "stream", "text": [ "\r", - " 36%[======> ] 596.46M 10.6MB/s eta 5m 24s " + "Epoch 0: 4%| | 669/14932 [01:08<24:28, 9.71it/s, v_num=o30c, train/loss=2.940\r", + "Epoch 0: 4%| | 669/14932 [01:08<24:28, 9.71it/s, v_num=o30c, train/loss=3.890" ] }, { @@ -6441,7 +7264,8 @@ "output_type": "stream", "text": [ "\r", - " v 36%[======> ] 598.43M 10.4MB/s eta 5m 24s " + "Epoch 0: 4%| | 670/14932 [01:08<24:27, 9.72it/s, v_num=o30c, train/loss=3.890\r", + "Epoch 0: 4%| | 670/14932 [01:08<24:27, 9.72it/s, v_num=o30c, train/loss=1.530" ] }, { @@ -6449,7 +7273,8 @@ "output_type": "stream", "text": [ "\r", - " v5 36%[======> ] 600.44M 10.2MB/s eta 5m 24s " + "Epoch 0: 4%| | 671/14932 [01:09<24:26, 9.72it/s, v_num=o30c, train/loss=1.530\r", + "Epoch 0: 4%| | 671/14932 [01:09<24:26, 9.72it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -6457,7 +7282,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 36%[======> ] 602.46M 9.80MB/s eta 5m 18s " + "Epoch 0: 5%| | 672/14932 [01:09<24:28, 9.71it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 5%| | 672/14932 [01:09<24:28, 9.71it/s, v_num=o30c, train/loss=3.110" ] }, { @@ -6465,7 +7291,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 36%[======> ] 604.50M 9.55MB/s eta 5m 18s " + "Epoch 0: 5%| | 673/14932 [01:09<24:27, 9.71it/s, v_num=o30c, train/loss=3.110\r", + "Epoch 0: 5%| | 673/14932 [01:09<24:27, 9.71it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -6473,7 +7300,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 36%[======> ] 606.57M 9.29MB/s eta 5m 18s " + "Epoch 0: 5%| | 674/14932 [01:09<24:27, 9.72it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 5%| | 674/14932 [01:09<24:27, 9.72it/s, v_num=o30c, train/loss=3.080" ] }, { @@ -6481,7 +7309,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 37%[======> ] 608.66M 9.77MB/s eta 5m 18s " + "Epoch 0: 5%| | 675/14932 [01:09<24:26, 9.72it/s, v_num=o30c, train/loss=3.080\r", + "Epoch 0: 5%| | 675/14932 [01:09<24:26, 9.72it/s, v_num=o30c, train/loss=3.880" ] }, { @@ -6489,7 +7318,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 37%[======> ] 610.75M 9.40MB/s eta 5m 18s " + "Epoch 0: 5%| | 676/14932 [01:09<24:27, 9.72it/s, v_num=o30c, train/loss=3.880\r", + "Epoch 0: 5%| | 676/14932 [01:09<24:27, 9.72it/s, v_num=o30c, train/loss=3.550" ] }, { @@ -6497,7 +7327,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 37%[======> ] 612.88M 9.27MB/s eta 5m 11s " + "Epoch 0: 5%| | 677/14932 [01:09<24:26, 9.72it/s, v_num=o30c, train/loss=3.550\r", + "Epoch 0: 5%| | 677/14932 [01:09<24:26, 9.72it/s, v_num=o30c, train/loss=3.880" ] }, { @@ -6505,7 +7336,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 37%[======> ] 615.02M 9.19MB/s eta 5m 11s " + "Epoch 0: 5%| | 678/14932 [01:09<24:25, 9.73it/s, v_num=o30c, train/loss=3.880\r", + "Epoch 0: 5%| | 678/14932 [01:09<24:25, 9.72it/s, v_num=o30c, train/loss=2.390" ] }, { @@ -6513,7 +7345,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 37%[======> ] 617.18M 9.10MB/s eta 5m 11s " + "Epoch 0: 5%| | 679/14932 [01:09<24:25, 9.73it/s, v_num=o30c, train/loss=2.390\r", + "Epoch 0: 5%| | 679/14932 [01:09<24:25, 9.73it/s, v_num=o30c, train/loss=4.590" ] }, { @@ -6521,7 +7354,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 37%[======> ] 619.33M 9.00MB/s eta 5m 11s " + "Epoch 0: 5%| | 680/14932 [01:09<24:24, 9.73it/s, v_num=o30c, train/loss=4.590\r", + "Epoch 0: 5%| | 680/14932 [01:09<24:24, 9.73it/s, v_num=o30c, train/loss=2.910" ] }, { @@ -6529,7 +7363,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 37%[======> ] 621.50M 8.91MB/s eta 5m 11s " + "Epoch 0: 5%| | 681/14932 [01:09<24:23, 9.74it/s, v_num=o30c, train/loss=2.910\r", + "Epoch 0: 5%| | 681/14932 [01:09<24:23, 9.73it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -6537,7 +7372,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 37%[======> ] 623.69M 9.09MB/s eta 5m 5s " + "Epoch 0: 5%| | 682/14932 [01:10<24:23, 9.74it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 5%| | 682/14932 [01:10<24:23, 9.74it/s, v_num=o30c, train/loss=4.720" ] }, { @@ -6545,7 +7381,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 38%[======> ] 625.89M 9.34MB/s eta 5m 5s " + "Epoch 0: 5%| | 683/14932 [01:10<24:22, 9.74it/s, v_num=o30c, train/loss=4.720\r", + "Epoch 0: 5%| | 683/14932 [01:10<24:22, 9.74it/s, v_num=o30c, train/loss=2.560" ] }, { @@ -6553,7 +7390,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 38%[======> ] 628.10M 9.05MB/s eta 5m 5s " + "Epoch 0: 5%| | 684/14932 [01:10<24:21, 9.75it/s, v_num=o30c, train/loss=2.560\r", + "Epoch 0: 5%| | 684/14932 [01:10<24:21, 9.75it/s, v_num=o30c, train/loss=2.410" ] }, { @@ -6561,7 +7399,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 38%[======> ] 630.32M 9.13MB/s eta 5m 5s " + "Epoch 0: 5%| | 685/14932 [01:10<24:21, 9.75it/s, v_num=o30c, train/loss=2.410\r", + "Epoch 0: 5%| | 685/14932 [01:10<24:21, 9.75it/s, v_num=o30c, train/loss=3.380" ] }, { @@ -6569,7 +7408,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 38%[======> ] 632.54M 9.20MB/s eta 5m 5s " + "Epoch 0: 5%| | 686/14932 [01:10<24:20, 9.76it/s, v_num=o30c, train/loss=3.380\r", + "Epoch 0: 5%| | 686/14932 [01:10<24:20, 9.76it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -6577,7 +7417,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 38%[======> ] 634.77M 9.33MB/s eta 4m 58s " + "Epoch 0: 5%| | 687/14932 [01:10<24:19, 9.76it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 5%| | 687/14932 [01:10<24:19, 9.76it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -6585,7 +7426,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 38%[======> ] 634.91M 8.69MB/s eta 4m 58s " + "Epoch 0: 5%| | 688/14932 [01:10<24:18, 9.76it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 5%| | 688/14932 [01:10<24:18, 9.76it/s, v_num=o30c, train/loss=3.720" ] }, { @@ -6593,7 +7435,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 38%[======> ] 635.42M 8.32MB/s eta 4m 58s " + "Epoch 0: 5%| | 689/14932 [01:10<24:18, 9.77it/s, v_num=o30c, train/loss=3.720\r", + "Epoch 0: 5%| | 689/14932 [01:10<24:18, 9.77it/s, v_num=o30c, train/loss=1.170" ] }, { @@ -6601,7 +7444,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 39%[======> ] 641.49M 9.27MB/s eta 4m 58s " + "Epoch 0: 5%| | 690/14932 [01:10<24:17, 9.77it/s, v_num=o30c, train/loss=1.170\r", + "Epoch 0: 5%| | 690/14932 [01:10<24:17, 9.77it/s, v_num=o30c, train/loss=2.910" ] }, { @@ -6609,7 +7453,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 39%[======> ] 643.49M 8.54MB/s eta 4m 53s " + "Epoch 0: 5%| | 691/14932 [01:10<24:16, 9.78it/s, v_num=o30c, train/loss=2.910\r", + "Epoch 0: 5%| | 691/14932 [01:10<24:16, 9.78it/s, v_num=o30c, train/loss=4.750" ] }, { @@ -6617,7 +7462,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 39%[======> ] 646.07M 9.05MB/s eta 4m 53s " + "Epoch 0: 5%| | 692/14932 [01:10<24:15, 9.78it/s, v_num=o30c, train/loss=4.750\r", + "Epoch 0: 5%| | 692/14932 [01:10<24:15, 9.78it/s, v_num=o30c, train/loss=2.730" ] }, { @@ -6625,7 +7471,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 39%[======> ] 647.64M 8.75MB/s eta 4m 53s " + "Epoch 0: 5%| | 693/14932 [01:10<24:15, 9.78it/s, v_num=o30c, train/loss=2.730\r", + "Epoch 0: 5%| | 693/14932 [01:10<24:15, 9.78it/s, v_num=o30c, train/loss=1.660" ] }, { @@ -6633,7 +7480,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 39%[======> ] 649.25M 8.57MB/s eta 4m 53s " + "Epoch 0: 5%| | 694/14932 [01:10<24:14, 9.79it/s, v_num=o30c, train/loss=1.660\r", + "Epoch 0: 5%| | 694/14932 [01:10<24:14, 9.79it/s, v_num=o30c, train/loss=3.660" ] }, { @@ -6641,7 +7489,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 39%[======> ] 650.89M 8.45MB/s eta 4m 53s " + "Epoch 0: 5%| | 695/14932 [01:10<24:13, 9.79it/s, v_num=o30c, train/loss=3.660\r", + "Epoch 0: 5%| | 695/14932 [01:10<24:13, 9.79it/s, v_num=o30c, train/loss=3.170" ] }, { @@ -6649,7 +7498,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 39%[======> ] 652.55M 8.23MB/s eta 4m 48s " + "Epoch 0: 5%| | 696/14932 [01:11<24:14, 9.79it/s, v_num=o30c, train/loss=3.170\r", + "Epoch 0: 5%| | 696/14932 [01:11<24:14, 9.79it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -6657,7 +7507,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 39%[======> ] 654.24M 8.51MB/s eta 4m 48s " + "Epoch 0: 5%| | 697/14932 [01:11<24:13, 9.79it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 5%| | 697/14932 [01:11<24:13, 9.79it/s, v_num=o30c, train/loss=3.380" ] }, { @@ -6665,7 +7516,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 39%[======> ] 655.96M 8.36MB/s eta 4m 48s " + "Epoch 0: 5%| | 698/14932 [01:11<24:13, 9.80it/s, v_num=o30c, train/loss=3.380\r", + "Epoch 0: 5%| | 698/14932 [01:11<24:13, 9.80it/s, v_num=o30c, train/loss=4.380" ] }, { @@ -6673,7 +7525,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 40%[=======> ] 657.68M 8.01MB/s eta 4m 48s " + "Epoch 0: 5%| | 699/14932 [01:11<24:12, 9.80it/s, v_num=o30c, train/loss=4.380\r", + "Epoch 0: 5%| | 699/14932 [01:11<24:12, 9.80it/s, v_num=o30c, train/loss=3.390" ] }, { @@ -6681,7 +7534,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 40%[=======> ] 658.88M 7.54MB/s eta 4m 45s " + "Epoch 0: 5%| | 700/14932 [01:11<24:11, 9.80it/s, v_num=o30c, train/loss=3.390\r", + "Epoch 0: 5%| | 700/14932 [01:11<24:11, 9.80it/s, v_num=o30c, train/loss=2.830" ] }, { @@ -6689,7 +7543,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 40%[=======> ] 662.04M 7.80MB/s eta 4m 45s " + "Epoch 0: 5%| | 701/14932 [01:11<24:12, 9.80it/s, v_num=o30c, train/loss=2.830\r", + "Epoch 0: 5%| | 701/14932 [01:11<24:12, 9.80it/s, v_num=o30c, train/loss=3.050" ] }, { @@ -6697,7 +7552,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 40%[=======> ] 663.30M 7.43MB/s eta 4m 45s " + "Epoch 0: 5%| | 702/14932 [01:11<24:11, 9.80it/s, v_num=o30c, train/loss=3.050\r", + "Epoch 0: 5%| | 702/14932 [01:11<24:11, 9.80it/s, v_num=o30c, train/loss=3.190" ] }, { @@ -6705,7 +7561,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 40%[=======> ] 664.58M 7.60MB/s eta 4m 45s " + "Epoch 0: 5%| | 703/14932 [01:11<24:10, 9.81it/s, v_num=o30c, train/loss=3.190\r", + "Epoch 0: 5%| | 703/14932 [01:11<24:10, 9.81it/s, v_num=o30c, train/loss=2.300" ] }, { @@ -6713,7 +7570,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 40%[=======> ] 665.88M 7.28MB/s eta 4m 45s " + "Epoch 0: 5%| | 704/14932 [01:11<24:12, 9.80it/s, v_num=o30c, train/loss=2.300\r", + "Epoch 0: 5%| | 704/14932 [01:11<24:12, 9.80it/s, v_num=o30c, train/loss=3.060" ] }, { @@ -6721,7 +7579,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 40%[=======> ] 667.18M 7.32MB/s eta 4m 41s " + "Epoch 0: 5%| | 705/14932 [01:11<24:11, 9.80it/s, v_num=o30c, train/loss=3.060\r", + "Epoch 0: 5%| | 705/14932 [01:11<24:11, 9.80it/s, v_num=o30c, train/loss=3.390" ] }, { @@ -6729,7 +7588,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 40%[=======> ] 668.39M 7.50MB/s eta 4m 41s " + "Epoch 0: 5%| | 706/14932 [01:12<24:11, 9.80it/s, v_num=o30c, train/loss=3.390\r", + "Epoch 0: 5%| | 706/14932 [01:12<24:11, 9.80it/s, v_num=o30c, train/loss=4.120" ] }, { @@ -6737,7 +7597,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 40%[=======> ] 669.86M 7.08MB/s eta 4m 41s " + "Epoch 0: 5%| | 707/14932 [01:12<24:12, 9.79it/s, v_num=o30c, train/loss=4.120\r", + "Epoch 0: 5%| | 707/14932 [01:12<24:12, 9.79it/s, v_num=o30c, train/loss=3.080" ] }, { @@ -6745,7 +7606,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 40%[=======> ] 671.22M 6.57MB/s eta 4m 41s " + "Epoch 0: 5%| | 708/14932 [01:12<24:12, 9.80it/s, v_num=o30c, train/loss=3.080\r", + "Epoch 0: 5%| | 708/14932 [01:12<24:12, 9.80it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -6753,7 +7615,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 40%[=======> ] 672.13M 6.73MB/s eta 4m 41s " + "Epoch 0: 5%| | 709/14932 [01:12<24:11, 9.80it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 5%| | 709/14932 [01:12<24:11, 9.80it/s, v_num=o30c, train/loss=4.590" ] }, { @@ -6761,7 +7624,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 40%[=======> ] 672.91M 6.24MB/s eta 4m 39s " + "Epoch 0: 5%| | 710/14932 [01:12<24:10, 9.80it/s, v_num=o30c, train/loss=4.590\r", + "Epoch 0: 5%| | 710/14932 [01:12<24:11, 9.80it/s, v_num=o30c, train/loss=2.880" ] }, { @@ -6769,7 +7633,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 41%[=======> ] 674.30M 6.20MB/s eta 4m 39s " + "Epoch 0: 5%| | 711/14932 [01:12<24:10, 9.81it/s, v_num=o30c, train/loss=2.880\r", + "Epoch 0: 5%| | 711/14932 [01:12<24:10, 9.81it/s, v_num=o30c, train/loss=3.270" ] }, { @@ -6777,7 +7642,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 41%[=======> ] 675.71M 6.15MB/s eta 4m 39s " + "Epoch 0: 5%| | 712/14932 [01:12<24:09, 9.81it/s, v_num=o30c, train/loss=3.270\r", + "Epoch 0: 5%| | 712/14932 [01:12<24:09, 9.81it/s, v_num=o30c, train/loss=3.440" ] }, { @@ -6785,7 +7651,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 41%[=======> ] 677.13M 6.02MB/s eta 4m 39s " + "Epoch 0: 5%| | 713/14932 [01:12<24:08, 9.81it/s, v_num=o30c, train/loss=3.440\r", + "Epoch 0: 5%| | 713/14932 [01:12<24:08, 9.81it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -6793,7 +7660,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 41%[=======> ] 678.57M 5.92MB/s eta 4m 39s " + "Epoch 0: 5%| | 714/14932 [01:12<24:08, 9.82it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 5%| | 714/14932 [01:12<24:08, 9.82it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -6801,7 +7669,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 41%[=======> ] 679.99M 5.95MB/s eta 4m 35s " + "Epoch 0: 5%| | 715/14932 [01:12<24:07, 9.82it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 5%| | 715/14932 [01:12<24:07, 9.82it/s, v_num=o30c, train/loss=3.730" ] }, { @@ -6809,7 +7678,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 41%[=======> ] 681.44M 6.23MB/s eta 4m 35s " + "Epoch 0: 5%| | 716/14932 [01:12<24:06, 9.83it/s, v_num=o30c, train/loss=3.730\r", + "Epoch 0: 5%| | 716/14932 [01:12<24:06, 9.83it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -6817,7 +7687,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 41%[=======> ] 682.89M 5.68MB/s eta 4m 35s " + "Epoch 0: 5%| | 717/14932 [01:12<24:06, 9.83it/s, v_num=o30c, train/loss=2.920\r", + "Epoch 0: 5%| | 717/14932 [01:12<24:06, 9.83it/s, v_num=o30c, train/loss=4.060" ] }, { @@ -6825,7 +7696,8 @@ "output_type": "stream", "text": [ "\r", - "pth 41%[=======> ] 684.35M 5.92MB/s eta 4m 35s " + "Epoch 0: 5%| | 718/14932 [01:13<24:05, 9.83it/s, v_num=o30c, train/loss=4.060\r", + "Epoch 0: 5%| | 718/14932 [01:13<24:05, 9.83it/s, v_num=o30c, train/loss=3.750" ] }, { @@ -6833,7 +7705,8 @@ "output_type": "stream", "text": [ "\r", - "th 41%[=======> ] 685.82M 5.89MB/s eta 4m 35s " + "Epoch 0: 5%| | 719/14932 [01:13<24:04, 9.84it/s, v_num=o30c, train/loss=3.750\r", + "Epoch 0: 5%| | 719/14932 [01:13<24:04, 9.84it/s, v_num=o30c, train/loss=3.800" ] }, { @@ -6841,7 +7714,8 @@ "output_type": "stream", "text": [ "\r", - "h 41%[=======> ] 687.29M 5.91MB/s eta 4m 32s " + "Epoch 0: 5%| | 720/14932 [01:13<24:04, 9.84it/s, v_num=o30c, train/loss=3.800\r", + "Epoch 0: 5%| | 720/14932 [01:13<24:04, 9.84it/s, v_num=o30c, train/loss=3.030" ] }, { @@ -6849,7 +7723,8 @@ "output_type": "stream", "text": [ "\r", - " 41%[=======> ] 688.75M 6.09MB/s eta 4m 32s " + "Epoch 0: 5%| | 721/14932 [01:13<24:04, 9.84it/s, v_num=o30c, train/loss=3.030\r", + "Epoch 0: 5%| | 721/14932 [01:13<24:04, 9.84it/s, v_num=o30c, train/loss=4.910" ] }, { @@ -6857,7 +7732,8 @@ "output_type": "stream", "text": [ "\r", - " v 42%[=======> ] 690.24M 6.02MB/s eta 4m 32s " + "Epoch 0: 5%| | 722/14932 [01:13<24:05, 9.83it/s, v_num=o30c, train/loss=4.910\r", + "Epoch 0: 5%| | 722/14932 [01:13<24:05, 9.83it/s, v_num=o30c, train/loss=3.800" ] }, { @@ -6865,7 +7741,8 @@ "output_type": "stream", "text": [ "\r", - " v5 42%[=======> ] 691.72M 6.05MB/s eta 4m 32s " + "Epoch 0: 5%| | 723/14932 [01:13<24:04, 9.84it/s, v_num=o30c, train/loss=3.800\r", + "Epoch 0: 5%| | 723/14932 [01:13<24:04, 9.84it/s, v_num=o30c, train/loss=2.780" ] }, { @@ -6873,7 +7750,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 42%[=======> ] 693.19M 6.16MB/s eta 4m 32s " + "Epoch 0: 5%| | 724/14932 [01:13<24:03, 9.84it/s, v_num=o30c, train/loss=2.780\r", + "Epoch 0: 5%| | 724/14932 [01:13<24:03, 9.84it/s, v_num=o30c, train/loss=1.560" ] }, { @@ -6881,7 +7759,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 42%[=======> ] 694.69M 6.11MB/s eta 4m 29s " + "Epoch 0: 5%| | 725/14932 [01:13<24:03, 9.84it/s, v_num=o30c, train/loss=1.560\r", + "Epoch 0: 5%| | 725/14932 [01:13<24:03, 9.84it/s, v_num=o30c, train/loss=3.620" ] }, { @@ -6889,7 +7768,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 42%[=======> ] 696.18M 6.13MB/s eta 4m 29s " + "Epoch 0: 5%| | 726/14932 [01:13<24:03, 9.84it/s, v_num=o30c, train/loss=3.620\r", + "Epoch 0: 5%| | 726/14932 [01:13<24:03, 9.84it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -6897,7 +7777,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 42%[=======> ] 697.66M 6.22MB/s eta 4m 29s " + "Epoch 0: 5%| | 727/14932 [01:13<24:03, 9.84it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 5%| | 727/14932 [01:13<24:03, 9.84it/s, v_num=o30c, train/loss=4.000" ] }, { @@ -6905,7 +7786,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 42%[=======> ] 699.16M 6.17MB/s eta 4m 29s " + "Epoch 0: 5%| | 728/14932 [01:13<24:02, 9.85it/s, v_num=o30c, train/loss=4.000\r", + "Epoch 0: 5%| | 728/14932 [01:13<24:02, 9.85it/s, v_num=o30c, train/loss=3.640" ] }, { @@ -6913,7 +7795,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 42%[=======> ] 700.66M 6.26MB/s eta 4m 29s " + "Epoch 0: 5%| | 729/14932 [01:13<24:01, 9.85it/s, v_num=o30c, train/loss=3.640\r", + "Epoch 0: 5%| | 729/14932 [01:13<24:01, 9.85it/s, v_num=o30c, train/loss=4.160" ] }, { @@ -6921,7 +7804,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 42%[=======> ] 702.14M 6.20MB/s eta 4m 25s " + "Epoch 0: 5%| | 730/14932 [01:14<24:00, 9.86it/s, v_num=o30c, train/loss=4.160\r", + "Epoch 0: 5%| | 730/14932 [01:14<24:01, 9.86it/s, v_num=o30c, train/loss=3.580" ] }, { @@ -6929,7 +7813,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 42%[=======> ] 703.64M 6.29MB/s eta 4m 25s " + "Epoch 0: 5%| | 731/14932 [01:14<24:00, 9.86it/s, v_num=o30c, train/loss=3.580\r", + "Epoch 0: 5%| | 731/14932 [01:14<24:00, 9.86it/s, v_num=o30c, train/loss=2.080" ] }, { @@ -6937,7 +7822,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 42%[=======> ] 705.14M 6.22MB/s eta 4m 25s " + "Epoch 0: 5%| | 732/14932 [01:14<23:59, 9.86it/s, v_num=o30c, train/loss=2.080\r", + "Epoch 0: 5%| | 732/14932 [01:14<23:59, 9.86it/s, v_num=o30c, train/loss=2.310" ] }, { @@ -6945,7 +7831,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 43%[=======> ] 706.63M 6.31MB/s eta 4m 25s " + "Epoch 0: 5%| | 733/14932 [01:14<24:00, 9.86it/s, v_num=o30c, train/loss=2.310\r", + "Epoch 0: 5%| | 733/14932 [01:14<24:00, 9.86it/s, v_num=o30c, train/loss=3.280" ] }, { @@ -6953,7 +7840,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 43%[=======> ] 708.13M 6.24MB/s eta 4m 25s " + "Epoch 0: 5%| | 734/14932 [01:14<23:59, 9.86it/s, v_num=o30c, train/loss=3.280\r", + "Epoch 0: 5%| | 734/14932 [01:14<23:59, 9.86it/s, v_num=o30c, train/loss=3.610" ] }, { @@ -6961,7 +7849,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 43%[=======> ] 709.63M 6.10MB/s eta 4m 22s " + "Epoch 0: 5%| | 735/14932 [01:14<23:58, 9.87it/s, v_num=o30c, train/loss=3.610\r", + "Epoch 0: 5%| | 735/14932 [01:14<23:58, 9.87it/s, v_num=o30c, train/loss=2.880" ] }, { @@ -6969,7 +7858,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 43%[=======> ] 711.11M 6.33MB/s eta 4m 22s " + "Epoch 0: 5%| | 736/14932 [01:14<24:00, 9.86it/s, v_num=o30c, train/loss=2.880\r", + "Epoch 0: 5%| | 736/14932 [01:14<24:00, 9.86it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -6977,7 +7867,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 43%[=======> ] 712.60M 6.31MB/s eta 4m 22s " + "Epoch 0: 5%| | 737/14932 [01:14<23:59, 9.86it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 5%| | 737/14932 [01:14<23:59, 9.86it/s, v_num=o30c, train/loss=3.590" ] }, { @@ -6985,7 +7876,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 43%[=======> ] 714.10M 6.41MB/s eta 4m 22s " + "Epoch 0: 5%| | 738/14932 [01:14<23:58, 9.87it/s, v_num=o30c, train/loss=3.590\r", + "Epoch 0: 5%| | 738/14932 [01:14<23:58, 9.87it/s, v_num=o30c, train/loss=2.500" ] }, { @@ -6993,7 +7885,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 43%[=======> ] 715.60M 6.33MB/s eta 4m 22s " + "Epoch 0: 5%| | 739/14932 [01:14<23:58, 9.87it/s, v_num=o30c, train/loss=2.500\r", + "Epoch 0: 5%| | 739/14932 [01:14<23:58, 9.87it/s, v_num=o30c, train/loss=2.610" ] }, { @@ -7001,7 +7894,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 43%[=======> ] 717.08M 6.48MB/s eta 4m 18s " + "Epoch 0: 5%| | 740/14932 [01:14<23:57, 9.87it/s, v_num=o30c, train/loss=2.610\r", + "Epoch 0: 5%| | 740/14932 [01:14<23:57, 9.87it/s, v_num=o30c, train/loss=2.830" ] }, { @@ -7009,7 +7903,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 43%[=======> ] 718.32M 6.35MB/s eta 4m 18s " + "Epoch 0: 5%| | 741/14932 [01:15<23:56, 9.88it/s, v_num=o30c, train/loss=2.830\r", + "Epoch 0: 5%| | 741/14932 [01:15<23:56, 9.88it/s, v_num=o30c, train/loss=2.730" ] }, { @@ -7017,7 +7912,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 43%[=======> ] 719.82M 6.31MB/s eta 4m 18s " + "Epoch 0: 5%| | 742/14932 [01:15<23:56, 9.88it/s, v_num=o30c, train/loss=2.730\r", + "Epoch 0: 5%| | 742/14932 [01:15<23:56, 9.88it/s, v_num=o30c, train/loss=2.840" ] }, { @@ -7025,7 +7921,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 43%[=======> ] 721.32M 6.42MB/s eta 4m 18s " + "Epoch 0: 5%| | 743/14932 [01:15<23:55, 9.88it/s, v_num=o30c, train/loss=2.840\r", + "Epoch 0: 5%| | 743/14932 [01:15<23:55, 9.88it/s, v_num=o30c, train/loss=3.410" ] }, { @@ -7033,7 +7930,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 44%[=======> ] 722.82M 6.35MB/s eta 4m 18s " + "Epoch 0: 5%| | 744/14932 [01:15<23:55, 9.89it/s, v_num=o30c, train/loss=3.410\r", + "Epoch 0: 5%| | 744/14932 [01:15<23:55, 9.89it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -7041,7 +7939,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 44%[=======> ] 724.32M 6.47MB/s eta 4m 15s " + "Epoch 0: 5%| | 745/14932 [01:15<23:54, 9.89it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 5%| | 745/14932 [01:15<23:54, 9.89it/s, v_num=o30c, train/loss=4.030" ] }, { @@ -7049,7 +7948,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 44%[=======> ] 725.83M 6.35MB/s eta 4m 15s " + "Epoch 0: 5%| | 746/14932 [01:15<23:53, 9.89it/s, v_num=o30c, train/loss=4.030\r", + "Epoch 0: 5%| | 746/14932 [01:15<23:53, 9.89it/s, v_num=o30c, train/loss=1.930" ] }, { @@ -7057,7 +7957,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 44%[=======> ] 727.33M 6.48MB/s eta 4m 15s " + "Epoch 0: 5%| | 747/14932 [01:15<23:54, 9.89it/s, v_num=o30c, train/loss=1.930\r", + "Epoch 0: 5%| | 747/14932 [01:15<23:54, 9.89it/s, v_num=o30c, train/loss=3.970" ] }, { @@ -7065,7 +7966,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 44%[=======> ] 728.85M 6.37MB/s eta 4m 15s " + "Epoch 0: 5%| | 748/14932 [01:15<23:53, 9.89it/s, v_num=o30c, train/loss=3.970\r", + "Epoch 0: 5%| | 748/14932 [01:15<23:53, 9.89it/s, v_num=o30c, train/loss=3.030" ] }, { @@ -7073,7 +7975,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 44%[=======> ] 730.36M 6.45MB/s eta 4m 15s " + "Epoch 0: 5%| | 749/14932 [01:15<23:53, 9.90it/s, v_num=o30c, train/loss=3.030\r", + "Epoch 0: 5%| | 749/14932 [01:15<23:53, 9.90it/s, v_num=o30c, train/loss=1.470" ] }, { @@ -7081,7 +7984,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 44%[=======> ] 731.88M 6.41MB/s eta 4m 12s " + "Epoch 0: 5%| | 750/14932 [01:15<23:52, 9.90it/s, v_num=o30c, train/loss=1.470\r", + "Epoch 0: 5%| | 750/14932 [01:15<23:52, 9.90it/s, v_num=o30c, train/loss=2.730" ] }, { @@ -7089,7 +7993,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 44%[=======> ] 733.41M 6.47MB/s eta 4m 12s " + "Epoch 0: 5%| | 751/14932 [01:15<23:51, 9.90it/s, v_num=o30c, train/loss=2.730\r", + "Epoch 0: 5%| | 751/14932 [01:15<23:51, 9.90it/s, v_num=o30c, train/loss=2.380" ] }, { @@ -7097,7 +8002,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 44%[=======> ] 734.94M 6.38MB/s eta 4m 12s " + "Epoch 0: 5%| | 752/14932 [01:15<23:51, 9.91it/s, v_num=o30c, train/loss=2.380\r", + "Epoch 0: 5%| | 752/14932 [01:15<23:51, 9.91it/s, v_num=o30c, train/loss=4.440" ] }, { @@ -7105,7 +8011,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 44%[=======> ] 736.47M 6.49MB/s eta 4m 12s " + "Epoch 0: 5%| | 753/14932 [01:15<23:50, 9.91it/s, v_num=o30c, train/loss=4.440\r", + "Epoch 0: 5%| | 753/14932 [01:15<23:50, 9.91it/s, v_num=o30c, train/loss=2.120" ] }, { @@ -7113,7 +8020,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 44%[=======> ] 738.02M 6.28MB/s eta 4m 12s " + "Epoch 0: 5%| | 754/14932 [01:16<23:50, 9.91it/s, v_num=o30c, train/loss=2.120\r", + "Epoch 0: 5%| | 754/14932 [01:16<23:50, 9.91it/s, v_num=o30c, train/loss=3.950" ] }, { @@ -7121,7 +8029,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 45%[========> ] 739.57M 6.52MB/s eta 4m 9s " + "Epoch 0: 5%| | 755/14932 [01:16<23:49, 9.92it/s, v_num=o30c, train/loss=3.950\r", + "Epoch 0: 5%| | 755/14932 [01:16<23:49, 9.92it/s, v_num=o30c, train/loss=3.110" ] }, { @@ -7129,7 +8038,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 45%[========> ] 741.13M 6.31MB/s eta 4m 9s " + "Epoch 0: 5%| | 756/14932 [01:16<23:48, 9.92it/s, v_num=o30c, train/loss=3.110\r", + "Epoch 0: 5%| | 756/14932 [01:16<23:48, 9.92it/s, v_num=o30c, train/loss=3.530" ] }, { @@ -7137,7 +8047,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 45%[========> ] 742.71M 6.57MB/s eta 4m 9s " + "Epoch 0: 5%| | 757/14932 [01:16<23:48, 9.93it/s, v_num=o30c, train/loss=3.530\r", + "Epoch 0: 5%| | 757/14932 [01:16<23:48, 9.93it/s, v_num=o30c, train/loss=3.300" ] }, { @@ -7145,7 +8056,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 45%[========> ] 744.29M 6.36MB/s eta 4m 9s " + "Epoch 0: 5%| | 758/14932 [01:16<23:47, 9.93it/s, v_num=o30c, train/loss=3.300\r", + "Epoch 0: 5%| | 758/14932 [01:16<23:47, 9.93it/s, v_num=o30c, train/loss=3.920" ] }, { @@ -7153,7 +8065,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 45%[========> ] 745.88M 6.62MB/s eta 4m 9s " + "Epoch 0: 5%| | 759/14932 [01:16<23:46, 9.93it/s, v_num=o30c, train/loss=3.920\r", + "Epoch 0: 5%| | 759/14932 [01:16<23:46, 9.93it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -7161,7 +8074,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 45%[========> ] 747.47M 6.59MB/s eta 4m 5s " + "Epoch 0: 5%| | 760/14932 [01:16<23:46, 9.94it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 5%| | 760/14932 [01:16<23:46, 9.94it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -7169,7 +8083,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 45%[========> ] 749.10M 6.73MB/s eta 4m 5s " + "Epoch 0: 5%| | 761/14932 [01:16<23:45, 9.94it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 5%| | 761/14932 [01:16<23:45, 9.94it/s, v_num=o30c, train/loss=2.610" ] }, { @@ -7177,7 +8092,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 45%[========> ] 750.74M 6.62MB/s eta 4m 5s " + "Epoch 0: 5%| | 762/14932 [01:16<23:44, 9.94it/s, v_num=o30c, train/loss=2.610\r", + "Epoch 0: 5%| | 762/14932 [01:16<23:44, 9.94it/s, v_num=o30c, train/loss=4.280" ] }, { @@ -7185,7 +8101,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 45%[========> ] 752.35M 6.19MB/s eta 4m 5s " + "Epoch 0: 5%| | 763/14932 [01:16<23:44, 9.95it/s, v_num=o30c, train/loss=4.280\r", + "Epoch 0: 5%| | 763/14932 [01:16<23:44, 9.95it/s, v_num=o30c, train/loss=3.170" ] }, { @@ -7193,7 +8110,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 46%[========> ] 755.19M 6.50MB/s eta 4m 2s " + "Epoch 0: 5%| | 764/14932 [01:16<23:43, 9.95it/s, v_num=o30c, train/loss=3.170\r", + "Epoch 0: 5%| | 764/14932 [01:16<23:43, 9.95it/s, v_num=o30c, train/loss=2.690" ] }, { @@ -7201,7 +8119,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 46%[========> ] 756.39M 6.63MB/s eta 4m 2s " + "Epoch 0: 5%| | 765/14932 [01:16<23:42, 9.96it/s, v_num=o30c, train/loss=2.690\r", + "Epoch 0: 5%| | 765/14932 [01:16<23:42, 9.96it/s, v_num=o30c, train/loss=2.340" ] }, { @@ -7209,7 +8128,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 46%[========> ] 757.63M 6.39MB/s eta 4m 2s " + "Epoch 0: 5%| | 766/14932 [01:16<23:42, 9.96it/s, v_num=o30c, train/loss=2.340\r", + "Epoch 0: 5%| | 766/14932 [01:16<23:42, 9.96it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -7217,7 +8137,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 46%[========> ] 758.88M 6.54MB/s eta 4m 2s " + "Epoch 0: 5%| | 767/14932 [01:16<23:41, 9.96it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 5%| | 767/14932 [01:16<23:41, 9.96it/s, v_num=o30c, train/loss=1.960" ] }, { @@ -7225,7 +8146,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 46%[========> ] 760.18M 6.27MB/s eta 4m 2s " + "Epoch 0: 5%| | 768/14932 [01:17<23:43, 9.95it/s, v_num=o30c, train/loss=1.960\r", + "Epoch 0: 5%| | 768/14932 [01:17<23:43, 9.95it/s, v_num=o30c, train/loss=2.750" ] }, { @@ -7233,7 +8155,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 46%[========> ] 760.68M 6.25MB/s eta 4m 0s " + "Epoch 0: 5%| | 769/14932 [01:17<23:42, 9.96it/s, v_num=o30c, train/loss=2.750\r", + "Epoch 0: 5%| | 769/14932 [01:17<23:42, 9.96it/s, v_num=o30c, train/loss=3.910" ] }, { @@ -7241,7 +8164,8 @@ "output_type": "stream", "text": [ "\r", - "pth 46%[========> ] 761.79M 6.10MB/s eta 4m 0s " + "Epoch 0: 5%| | 770/14932 [01:17<23:41, 9.96it/s, v_num=o30c, train/loss=3.910\r", + "Epoch 0: 5%| | 770/14932 [01:17<23:41, 9.96it/s, v_num=o30c, train/loss=2.770" ] }, { @@ -7249,7 +8173,8 @@ "output_type": "stream", "text": [ "\r", - "th 46%[========> ] 763.13M 6.12MB/s eta 4m 0s " + "Epoch 0: 5%| | 771/14932 [01:17<23:41, 9.96it/s, v_num=o30c, train/loss=2.770\r", + "Epoch 0: 5%| | 771/14932 [01:17<23:41, 9.96it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -7257,7 +8182,8 @@ "output_type": "stream", "text": [ "\r", - "h 46%[========> ] 764.52M 5.98MB/s eta 4m 0s " + "Epoch 0: 5%| | 772/14932 [01:17<23:40, 9.97it/s, v_num=o30c, train/loss=2.920\r", + "Epoch 0: 5%| | 772/14932 [01:17<23:40, 9.97it/s, v_num=o30c, train/loss=2.270" ] }, { @@ -7265,7 +8191,8 @@ "output_type": "stream", "text": [ "\r", - " 46%[========> ] 765.93M 6.00MB/s eta 4m 0s " + "Epoch 0: 5%| | 773/14932 [01:17<23:40, 9.97it/s, v_num=o30c, train/loss=2.270\r", + "Epoch 0: 5%| | 773/14932 [01:17<23:40, 9.97it/s, v_num=o30c, train/loss=3.730" ] }, { @@ -7273,7 +8200,8 @@ "output_type": "stream", "text": [ "\r", - " v 46%[========> ] 767.36M 5.88MB/s eta 3m 58s " + "Epoch 0: 5%| | 774/14932 [01:17<23:39, 9.97it/s, v_num=o30c, train/loss=3.730\r", + "Epoch 0: 5%| | 774/14932 [01:17<23:39, 9.97it/s, v_num=o30c, train/loss=3.550" ] }, { @@ -7281,7 +8209,8 @@ "output_type": "stream", "text": [ "\r", - " v5 46%[========> ] 768.83M 5.89MB/s eta 3m 58s " + "Epoch 0: 5%| | 775/14932 [01:17<23:39, 9.98it/s, v_num=o30c, train/loss=3.550\r", + "Epoch 0: 5%| | 775/14932 [01:17<23:39, 9.98it/s, v_num=o30c, train/loss=2.280" ] }, { @@ -7289,7 +8218,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 46%[========> ] 770.30M 5.78MB/s eta 3m 58s " + "Epoch 0: 5%| | 776/14932 [01:17<23:38, 9.98it/s, v_num=o30c, train/loss=2.280\r", + "Epoch 0: 5%| | 776/14932 [01:17<23:38, 9.98it/s, v_num=o30c, train/loss=3.390" ] }, { @@ -7297,7 +8227,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 47%[========> ] 771.80M 6.18MB/s eta 3m 58s " + "Epoch 0: 5%| | 777/14932 [01:17<23:37, 9.98it/s, v_num=o30c, train/loss=3.390\r", + "Epoch 0: 5%| | 777/14932 [01:17<23:37, 9.98it/s, v_num=o30c, train/loss=4.060" ] }, { @@ -7305,7 +8236,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 47%[========> ] 773.32M 5.65MB/s eta 3m 58s " + "Epoch 0: 5%| | 778/14932 [01:17<23:37, 9.98it/s, v_num=o30c, train/loss=4.060\r", + "Epoch 0: 5%| | 778/14932 [01:17<23:37, 9.98it/s, v_num=o30c, train/loss=4.250" ] }, { @@ -7313,7 +8245,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 47%[========> ] 774.83M 5.86MB/s eta 3m 55s " + "Epoch 0: 5%| | 779/14932 [01:17<23:37, 9.99it/s, v_num=o30c, train/loss=4.250\r", + "Epoch 0: 5%| | 779/14932 [01:17<23:37, 9.99it/s, v_num=o30c, train/loss=2.860" ] }, { @@ -7321,7 +8254,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 47%[========> ] 776.38M 5.82MB/s eta 3m 55s " + "Epoch 0: 5%| | 780/14932 [01:18<23:36, 9.99it/s, v_num=o30c, train/loss=2.860\r", + "Epoch 0: 5%| | 780/14932 [01:18<23:36, 9.99it/s, v_num=o30c, train/loss=3.620" ] }, { @@ -7329,7 +8263,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 47%[========> ] 777.94M 6.04MB/s eta 3m 55s " + "Epoch 0: 5%| | 781/14932 [01:18<23:35, 9.99it/s, v_num=o30c, train/loss=3.620\r", + "Epoch 0: 5%| | 781/14932 [01:18<23:35, 9.99it/s, v_num=o30c, train/loss=1.730" ] }, { @@ -7337,7 +8272,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 47%[========> ] 779.52M 5.99MB/s eta 3m 55s " + "Epoch 0: 5%| | 782/14932 [01:18<23:35, 10.00it/s, v_num=o30c, train/loss=1.730\r", + "Epoch 0: 5%| | 782/14932 [01:18<23:35, 10.00it/s, v_num=o30c, train/loss=2.340" ] }, { @@ -7345,7 +8281,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 47%[========> ] 781.10M 6.21MB/s eta 3m 55s " + "Epoch 0: 5%| | 783/14932 [01:18<23:34, 10.00it/s, v_num=o30c, train/loss=2.340\r", + "Epoch 0: 5%| | 783/14932 [01:18<23:34, 10.00it/s, v_num=o30c, train/loss=3.300" ] }, { @@ -7353,7 +8290,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 47%[========> ] 782.68M 6.14MB/s eta 3m 51s " + "Epoch 0: 5%| | 784/14932 [01:18<23:33, 10.01it/s, v_num=o30c, train/loss=3.300\r", + "Epoch 0: 5%| | 784/14932 [01:18<23:33, 10.01it/s, v_num=o30c, train/loss=4.380" ] }, { @@ -7361,7 +8299,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 47%[========> ] 784.29M 6.34MB/s eta 3m 51s " + "Epoch 0: 5%| | 785/14932 [01:18<23:33, 10.01it/s, v_num=o30c, train/loss=4.380\r", + "Epoch 0: 5%| | 785/14932 [01:18<23:33, 10.01it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -7369,7 +8308,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 47%[========> ] 785.89M 6.27MB/s eta 3m 51s " + "Epoch 0: 5%| | 786/14932 [01:18<23:32, 10.01it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 5%| | 786/14932 [01:18<23:32, 10.01it/s, v_num=o30c, train/loss=3.550" ] }, { @@ -7377,7 +8317,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 47%[========> ] 787.52M 6.47MB/s eta 3m 51s " + "Epoch 0: 5%| | 787/14932 [01:18<23:32, 10.02it/s, v_num=o30c, train/loss=3.550\r", + "Epoch 0: 5%| | 787/14932 [01:18<23:32, 10.02it/s, v_num=o30c, train/loss=3.300" ] }, { @@ -7385,7 +8326,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 48%[========> ] 789.14M 6.38MB/s eta 3m 51s " + "Epoch 0: 5%| | 788/14932 [01:18<23:31, 10.02it/s, v_num=o30c, train/loss=3.300\r", + "Epoch 0: 5%| | 788/14932 [01:18<23:31, 10.02it/s, v_num=o30c, train/loss=4.120" ] }, { @@ -7393,7 +8335,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 48%[========> ] 790.77M 6.57MB/s eta 3m 48s " + "Epoch 0: 5%| | 789/14932 [01:18<23:31, 10.02it/s, v_num=o30c, train/loss=4.120\r", + "Epoch 0: 5%| | 789/14932 [01:18<23:31, 10.02it/s, v_num=o30c, train/loss=2.560" ] }, { @@ -7401,7 +8344,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 48%[========> ] 792.41M 6.47MB/s eta 3m 48s " + "Epoch 0: 5%| | 790/14932 [01:18<23:30, 10.03it/s, v_num=o30c, train/loss=2.560\r", + "Epoch 0: 5%| | 790/14932 [01:18<23:30, 10.03it/s, v_num=o30c, train/loss=3.120" ] }, { @@ -7409,7 +8353,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 48%[========> ] 794.05M 6.67MB/s eta 3m 48s " + "Epoch 0: 5%| | 791/14932 [01:18<23:29, 10.03it/s, v_num=o30c, train/loss=3.120\r", + "Epoch 0: 5%| | 791/14932 [01:18<23:29, 10.03it/s, v_num=o30c, train/loss=3.550" ] }, { @@ -7417,7 +8362,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 48%[========> ] 795.71M 6.55MB/s eta 3m 48s " + "Epoch 0: 5%| | 792/14932 [01:18<23:30, 10.03it/s, v_num=o30c, train/loss=3.550\r", + "Epoch 0: 5%| | 792/14932 [01:18<23:30, 10.03it/s, v_num=o30c, train/loss=3.580" ] }, { @@ -7425,7 +8371,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 48%[========> ] 797.36M 6.74MB/s eta 3m 48s " + "Epoch 0: 5%| | 793/14932 [01:19<23:29, 10.03it/s, v_num=o30c, train/loss=3.580\r", + "Epoch 0: 5%| | 793/14932 [01:19<23:29, 10.03it/s, v_num=o30c, train/loss=2.200" ] }, { @@ -7433,7 +8380,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 48%[========> ] 799.02M 6.62MB/s eta 3m 45s " + "Epoch 0: 5%| | 794/14932 [01:19<23:29, 10.03it/s, v_num=o30c, train/loss=2.200\r", + "Epoch 0: 5%| | 794/14932 [01:19<23:29, 10.03it/s, v_num=o30c, train/loss=3.450" ] }, { @@ -7441,7 +8389,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 48%[========> ] 800.68M 6.80MB/s eta 3m 45s " + "Epoch 0: 5%| | 795/14932 [01:19<23:35, 9.98it/s, v_num=o30c, train/loss=3.450\r", + "Epoch 0: 5%| | 795/14932 [01:19<23:35, 9.98it/s, v_num=o30c, train/loss=3.810" ] }, { @@ -7449,7 +8398,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 48%[========> ] 802.35M 6.68MB/s eta 3m 45s " + "Epoch 0: 5%| | 796/14932 [01:19<23:35, 9.99it/s, v_num=o30c, train/loss=3.810\r", + "Epoch 0: 5%| | 796/14932 [01:19<23:35, 9.99it/s, v_num=o30c, train/loss=2.660" ] }, { @@ -7457,7 +8407,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 48%[========> ] 804.00M 6.95MB/s eta 3m 45s " + "Epoch 0: 5%| | 797/14932 [01:19<23:34, 9.99it/s, v_num=o30c, train/loss=2.660\r", + "Epoch 0: 5%| | 797/14932 [01:19<23:34, 9.99it/s, v_num=o30c, train/loss=3.700" ] }, { @@ -7465,7 +8416,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 49%[========> ] 805.68M 6.70MB/s eta 3m 45s " + "Epoch 0: 5%| | 798/14932 [01:19<23:34, 10.00it/s, v_num=o30c, train/loss=3.700\r", + "Epoch 0: 5%| | 798/14932 [01:19<23:34, 10.00it/s, v_num=o30c, train/loss=3.090" ] }, { @@ -7473,7 +8425,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 49%[========> ] 807.35M 6.89MB/s eta 3m 42s " + "Epoch 0: 5%| | 799/14932 [01:19<23:33, 10.00it/s, v_num=o30c, train/loss=3.090\r", + "Epoch 0: 5%| | 799/14932 [01:19<23:33, 10.00it/s, v_num=o30c, train/loss=4.910" ] }, { @@ -7481,7 +8434,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 49%[========> ] 809.00M 6.76MB/s eta 3m 42s " + "Epoch 0: 5%| | 800/14932 [01:20<23:35, 9.99it/s, v_num=o30c, train/loss=4.910\r", + "Epoch 0: 5%| | 800/14932 [01:20<23:35, 9.99it/s, v_num=o30c, train/loss=3.050" ] }, { @@ -7489,7 +8443,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 49%[========> ] 810.68M 7.03MB/s eta 3m 42s " + "Epoch 0: 5%| | 801/14932 [01:20<23:34, 9.99it/s, v_num=o30c, train/loss=3.050\r", + "Epoch 0: 5%| | 801/14932 [01:20<23:34, 9.99it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -7497,7 +8452,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 49%[========> ] 812.35M 6.79MB/s eta 3m 42s " + "Epoch 0: 5%| | 802/14932 [01:20<23:33, 9.99it/s, v_num=o30c, train/loss=2.920\r", + "Epoch 0: 5%| | 802/14932 [01:20<23:33, 9.99it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -7505,7 +8461,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 49%[========> ] 814.00M 7.07MB/s eta 3m 42s " + "Epoch 0: 5%| | 803/14932 [01:20<23:33, 10.00it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 5%| | 803/14932 [01:20<23:33, 10.00it/s, v_num=o30c, train/loss=2.470" ] }, { @@ -7513,7 +8470,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 49%[========> ] 814.54M 6.72MB/s eta 3m 39s " + "Epoch 0: 5%| | 804/14932 [01:20<23:32, 10.00it/s, v_num=o30c, train/loss=2.470\r", + "Epoch 0: 5%| | 804/14932 [01:20<23:32, 10.00it/s, v_num=o30c, train/loss=3.660" ] }, { @@ -7521,7 +8479,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 49%[========> ] 817.08M 7.15MB/s eta 3m 39s " + "Epoch 0: 5%| | 805/14932 [01:20<23:31, 10.01it/s, v_num=o30c, train/loss=3.660\r", + "Epoch 0: 5%| | 805/14932 [01:20<23:31, 10.01it/s, v_num=o30c, train/loss=2.780" ] }, { @@ -7529,7 +8488,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 49%[========> ] 818.72M 7.10MB/s eta 3m 39s " + "Epoch 0: 5%| | 806/14932 [01:20<23:31, 10.01it/s, v_num=o30c, train/loss=2.780\r", + "Epoch 0: 5%| | 806/14932 [01:20<23:31, 10.01it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -7537,7 +8497,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 49%[========> ] 820.38M 6.99MB/s eta 3m 39s " + "Epoch 0: 5%| | 807/14932 [01:20<23:30, 10.01it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 5%| | 807/14932 [01:20<23:30, 10.01it/s, v_num=o30c, train/loss=4.160" ] }, { @@ -7545,7 +8506,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 50%[=========> ] 822.04M 7.17MB/s eta 3m 39s " + "Epoch 0: 5%| | 808/14932 [01:20<23:30, 10.02it/s, v_num=o30c, train/loss=4.160\r", + "Epoch 0: 5%| | 808/14932 [01:20<23:30, 10.02it/s, v_num=o30c, train/loss=3.970" ] }, { @@ -7553,7 +8515,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 50%[=========> ] 823.64M 7.06MB/s eta 3m 35s " + "Epoch 0: 5%| | 809/14932 [01:20<23:29, 10.02it/s, v_num=o30c, train/loss=3.970\r", + "Epoch 0: 5%| | 809/14932 [01:20<23:29, 10.02it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -7561,7 +8524,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 50%[=========> ] 824.32M 7.09MB/s eta 3m 35s " + "Epoch 0: 5%| | 810/14932 [01:20<23:28, 10.02it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 5%| | 810/14932 [01:20<23:28, 10.02it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -7569,7 +8533,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 50%[=========> ] 825.69M 6.85MB/s eta 3m 35s " + "Epoch 0: 5%| | 811/14932 [01:20<23:28, 10.03it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 5%| | 811/14932 [01:20<23:28, 10.03it/s, v_num=o30c, train/loss=3.190" ] }, { @@ -7577,7 +8542,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 50%[=========> ] 827.36M 7.08MB/s eta 3m 35s " + "Epoch 0: 5%| | 812/14932 [01:20<23:27, 10.03it/s, v_num=o30c, train/loss=3.190\r", + "Epoch 0: 5%| | 812/14932 [01:20<23:27, 10.03it/s, v_num=o30c, train/loss=3.120" ] }, { @@ -7585,7 +8551,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 50%[=========> ] 829.02M 6.86MB/s eta 3m 35s " + "Epoch 0: 5%| | 813/14932 [01:21<23:26, 10.04it/s, v_num=o30c, train/loss=3.120\r", + "Epoch 0: 5%| | 813/14932 [01:21<23:26, 10.04it/s, v_num=o30c, train/loss=3.730" ] }, { @@ -7593,7 +8560,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 50%[=========> ] 830.69M 7.08MB/s eta 3m 33s " + "Epoch 0: 5%| | 814/14932 [01:21<23:26, 10.04it/s, v_num=o30c, train/loss=3.730\r", + "Epoch 0: 5%| | 814/14932 [01:21<23:26, 10.04it/s, v_num=o30c, train/loss=3.780" ] }, { @@ -7601,7 +8569,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 50%[=========> ] 832.36M 6.86MB/s eta 3m 33s " + "Epoch 0: 5%| | 815/14932 [01:21<23:25, 10.04it/s, v_num=o30c, train/loss=3.780\r", + "Epoch 0: 5%| | 815/14932 [01:21<23:25, 10.04it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -7609,7 +8578,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 50%[=========> ] 834.05M 7.08MB/s eta 3m 33s " + "Epoch 0: 5%| | 816/14932 [01:21<23:25, 10.04it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 5%| | 816/14932 [01:21<23:25, 10.04it/s, v_num=o30c, train/loss=2.090" ] }, { @@ -7617,7 +8587,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 50%[=========> ] 835.72M 6.83MB/s eta 3m 33s " + "Epoch 0: 5%| | 817/14932 [01:21<23:24, 10.05it/s, v_num=o30c, train/loss=2.090\r", + "Epoch 0: 5%| | 817/14932 [01:21<23:24, 10.05it/s, v_num=o30c, train/loss=3.220" ] }, { @@ -7625,7 +8596,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 51%[=========> ] 837.39M 7.21MB/s eta 3m 33s " + "Epoch 0: 5%| | 818/14932 [01:21<23:24, 10.05it/s, v_num=o30c, train/loss=3.220\r", + "Epoch 0: 5%| | 818/14932 [01:21<23:24, 10.05it/s, v_num=o30c, train/loss=3.830" ] }, { @@ -7633,7 +8605,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 51%[=========> ] 838.24M 6.73MB/s eta 3m 30s " + "Epoch 0: 5%| | 819/14932 [01:21<23:23, 10.05it/s, v_num=o30c, train/loss=3.830\r", + "Epoch 0: 5%| | 819/14932 [01:21<23:23, 10.05it/s, v_num=o30c, train/loss=3.810" ] }, { @@ -7641,7 +8614,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 51%[=========> ] 841.22M 6.89MB/s eta 3m 30s " + "Epoch 0: 5%| | 820/14932 [01:21<23:23, 10.06it/s, v_num=o30c, train/loss=3.810\r", + "Epoch 0: 5%| | 820/14932 [01:21<23:23, 10.06it/s, v_num=o30c, train/loss=2.640" ] }, { @@ -7649,7 +8623,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 51%[=========> ] 842.38M 6.83MB/s eta 3m 30s " + "Epoch 0: 5%| | 821/14932 [01:21<23:23, 10.05it/s, v_num=o30c, train/loss=2.640\r", + "Epoch 0: 5%| | 821/14932 [01:21<23:23, 10.05it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -7657,7 +8632,8 @@ "output_type": "stream", "text": [ "\r", - "pth 51%[=========> ] 842.83M 6.67MB/s eta 3m 30s " + "Epoch 0: 6%| | 822/14932 [01:21<23:22, 10.06it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 6%| | 822/14932 [01:21<23:23, 10.06it/s, v_num=o30c, train/loss=3.620" ] }, { @@ -7665,7 +8641,8 @@ "output_type": "stream", "text": [ "\r", - "th 51%[=========> ] 844.00M 6.36MB/s eta 3m 30s " + "Epoch 0: 6%| | 823/14932 [01:21<23:22, 10.06it/s, v_num=o30c, train/loss=3.620\r", + "Epoch 0: 6%| | 823/14932 [01:21<23:22, 10.06it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -7673,7 +8650,8 @@ "output_type": "stream", "text": [ "\r", - "h 51%[=========> ] 845.29M 6.49MB/s eta 3m 27s " + "Epoch 0: 6%| | 824/14932 [01:21<23:22, 10.06it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 6%| | 824/14932 [01:21<23:22, 10.06it/s, v_num=o30c, train/loss=2.840" ] }, { @@ -7681,7 +8659,8 @@ "output_type": "stream", "text": [ "\r", - " 51%[=========> ] 846.60M 6.35MB/s eta 3m 27s " + "Epoch 0: 6%| | 825/14932 [01:21<23:21, 10.07it/s, v_num=o30c, train/loss=2.840\r", + "Epoch 0: 6%| | 825/14932 [01:21<23:21, 10.07it/s, v_num=o30c, train/loss=1.330" ] }, { @@ -7689,7 +8668,8 @@ "output_type": "stream", "text": [ "\r", - " v 51%[=========> ] 847.93M 6.37MB/s eta 3m 27s " + "Epoch 0: 6%| | 826/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=1.330\r", + "Epoch 0: 6%| | 826/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=1.800" ] }, { @@ -7697,7 +8677,8 @@ "output_type": "stream", "text": [ "\r", - " v5 51%[=========> ] 849.13M 6.23MB/s eta 3m 27s " + "Epoch 0: 6%| | 827/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=1.800\r", + "Epoch 0: 6%| | 827/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=4.030" ] }, { @@ -7705,7 +8686,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 51%[=========> ] 850.38M 6.04MB/s eta 3m 27s " + "Epoch 0: 6%| | 828/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=4.030\r", + "Epoch 0: 6%| | 828/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=3.450" ] }, { @@ -7713,7 +8695,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 51%[=========> ] 851.77M 6.13MB/s eta 3m 25s " + "Epoch 0: 6%| | 829/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=3.450\r", + "Epoch 0: 6%| | 829/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=2.050" ] }, { @@ -7721,7 +8704,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 51%[=========> ] 853.16M 5.99MB/s eta 3m 25s " + "Epoch 0: 6%| | 830/14932 [01:22<23:19, 10.07it/s, v_num=o30c, train/loss=2.050\r", + "Epoch 0: 6%| | 830/14932 [01:22<23:19, 10.07it/s, v_num=o30c, train/loss=3.580" ] }, { @@ -7729,7 +8713,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 52%[=========> ] 853.86M 6.02MB/s eta 3m 25s " + "Epoch 0: 6%| | 831/14932 [01:22<23:19, 10.08it/s, v_num=o30c, train/loss=3.580\r", + "Epoch 0: 6%| | 831/14932 [01:22<23:19, 10.08it/s, v_num=o30c, train/loss=3.780" ] }, { @@ -7737,7 +8722,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 52%[=========> ] 855.08M 5.68MB/s eta 3m 25s " + "Epoch 0: 6%| | 832/14932 [01:22<23:20, 10.06it/s, v_num=o30c, train/loss=3.780\r", + "Epoch 0: 6%| | 832/14932 [01:22<23:20, 10.06it/s, v_num=o30c, train/loss=2.170" ] }, { @@ -7745,7 +8731,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 52%[=========> ] 856.54M 5.92MB/s eta 3m 25s " + "Epoch 0: 6%| | 833/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=2.170\r", + "Epoch 0: 6%| | 833/14932 [01:22<23:20, 10.07it/s, v_num=o30c, train/loss=2.950" ] }, { @@ -7753,7 +8740,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 52%[=========> ] 858.02M 5.85MB/s eta 3m 23s " + "Epoch 0: 6%| | 834/14932 [01:22<23:19, 10.07it/s, v_num=o30c, train/loss=2.950\r", + "Epoch 0: 6%| | 834/14932 [01:22<23:19, 10.07it/s, v_num=o30c, train/loss=3.230" ] }, { @@ -7761,7 +8749,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 52%[=========> ] 859.32M 5.82MB/s eta 3m 23s " + "Epoch 0: 6%| | 835/14932 [01:22<23:19, 10.07it/s, v_num=o30c, train/loss=3.230\r", + "Epoch 0: 6%| | 835/14932 [01:22<23:19, 10.07it/s, v_num=o30c, train/loss=2.940" ] }, { @@ -7769,7 +8758,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 52%[=========> ] 860.72M 6.01MB/s eta 3m 23s " + "Epoch 0: 6%| | 836/14932 [01:22<23:18, 10.08it/s, v_num=o30c, train/loss=2.940\r", + "Epoch 0: 6%| | 836/14932 [01:22<23:18, 10.08it/s, v_num=o30c, train/loss=3.500" ] }, { @@ -7777,7 +8767,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 52%[=========> ] 862.19M 5.54MB/s eta 3m 23s " + "Epoch 0: 6%| | 837/14932 [01:23<23:18, 10.08it/s, v_num=o30c, train/loss=3.500\r", + "Epoch 0: 6%| | 837/14932 [01:23<23:18, 10.08it/s, v_num=o30c, train/loss=2.670" ] }, { @@ -7785,7 +8776,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 52%[=========> ] 864.88M 6.15MB/s eta 3m 21s " + "Epoch 0: 6%| | 838/14932 [01:23<23:17, 10.08it/s, v_num=o30c, train/loss=2.670\r", + "Epoch 0: 6%| | 838/14932 [01:23<23:17, 10.08it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -7793,7 +8785,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 52%[=========> ] 865.24M 5.38MB/s eta 3m 21s " + "Epoch 0: 6%| | 839/14932 [01:23<23:17, 10.09it/s, v_num=o30c, train/loss=3.000\r", + "Epoch 0: 6%| | 839/14932 [01:23<23:17, 10.09it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -7801,7 +8794,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 52%[=========> ] 867.52M 5.85MB/s eta 3m 21s " + "Epoch 0: 6%| | 840/14932 [01:23<23:16, 10.09it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 6%| | 840/14932 [01:23<23:16, 10.09it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -7809,7 +8803,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 52%[=========> ] 868.18M 5.61MB/s eta 3m 21s " + "Epoch 0: 6%| | 841/14932 [01:23<23:16, 10.09it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 6%| | 841/14932 [01:23<23:16, 10.09it/s, v_num=o30c, train/loss=3.620" ] }, { @@ -7817,7 +8812,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 52%[=========> ] 868.97M 5.56MB/s eta 3m 19s " + "Epoch 0: 6%| | 842/14932 [01:23<23:15, 10.09it/s, v_num=o30c, train/loss=3.620\r", + "Epoch 0: 6%| | 842/14932 [01:23<23:15, 10.09it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -7825,7 +8821,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 52%[=========> ] 869.80M 5.36MB/s eta 3m 19s " + "Epoch 0: 6%| | 843/14932 [01:23<23:15, 10.10it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 6%| | 843/14932 [01:23<23:15, 10.10it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -7833,7 +8830,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 53%[=========> ] 870.64M 5.18MB/s eta 3m 19s " + "Epoch 0: 6%| | 844/14932 [01:23<23:14, 10.10it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 6%| | 844/14932 [01:23<23:14, 10.10it/s, v_num=o30c, train/loss=2.800" ] }, { @@ -7841,7 +8839,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 53%[=========> ] 871.44M 5.10MB/s eta 3m 19s " + "Epoch 0: 6%| | 845/14932 [01:23<23:13, 10.11it/s, v_num=o30c, train/loss=2.800\r", + "Epoch 0: 6%| | 845/14932 [01:23<23:13, 10.11it/s, v_num=o30c, train/loss=3.920" ] }, { @@ -7849,7 +8848,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 53%[=========> ] 872.19M 4.89MB/s eta 3m 19s " + "Epoch 0: 6%| | 846/14932 [01:23<23:14, 10.10it/s, v_num=o30c, train/loss=3.920\r", + "Epoch 0: 6%| | 846/14932 [01:23<23:14, 10.10it/s, v_num=o30c, train/loss=4.280" ] }, { @@ -7857,7 +8857,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 53%[=========> ] 873.04M 4.94MB/s eta 3m 18s " + "Epoch 0: 6%| | 847/14932 [01:23<23:13, 10.10it/s, v_num=o30c, train/loss=4.280\r", + "Epoch 0: 6%| | 847/14932 [01:23<23:13, 10.10it/s, v_num=o30c, train/loss=3.170" ] }, { @@ -7865,7 +8866,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 53%[=========> ] 873.91M 4.62MB/s eta 3m 18s " + "Epoch 0: 6%| | 848/14932 [01:23<23:13, 10.11it/s, v_num=o30c, train/loss=3.170\r", + "Epoch 0: 6%| | 848/14932 [01:23<23:13, 10.11it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -7873,7 +8875,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 53%[=========> ] 874.43M 4.58MB/s eta 3m 18s " + "Epoch 0: 6%| | 849/14932 [01:23<23:12, 10.11it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 6%| | 849/14932 [01:23<23:12, 10.11it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -7881,7 +8884,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 53%[=========> ] 875.08M 4.35MB/s eta 3m 18s " + "Epoch 0: 6%| | 850/14932 [01:24<23:12, 10.11it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 6%| | 850/14932 [01:24<23:12, 10.11it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -7889,7 +8893,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 53%[=========> ] 876.00M 4.20MB/s eta 3m 18s " + "Epoch 0: 6%| | 851/14932 [01:24<23:11, 10.12it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 6%| | 851/14932 [01:24<23:11, 10.12it/s, v_num=o30c, train/loss=3.200" ] }, { @@ -7897,7 +8902,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 53%[=========> ] 876.93M 4.32MB/s eta 3m 17s " + "Epoch 0: 6%| | 852/14932 [01:24<23:10, 10.12it/s, v_num=o30c, train/loss=3.200\r", + "Epoch 0: 6%| | 852/14932 [01:24<23:11, 10.12it/s, v_num=o30c, train/loss=3.560" ] }, { @@ -7905,7 +8911,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 53%[=========> ] 877.85M 3.78MB/s eta 3m 17s " + "Epoch 0: 6%| | 853/14932 [01:24<23:10, 10.12it/s, v_num=o30c, train/loss=3.560\r", + "Epoch 0: 6%| | 853/14932 [01:24<23:10, 10.12it/s, v_num=o30c, train/loss=3.980" ] }, { @@ -7913,7 +8920,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 53%[=========> ] 878.79M 4.20MB/s eta 3m 17s " + "Epoch 0: 6%| | 854/14932 [01:24<23:09, 10.13it/s, v_num=o30c, train/loss=3.980\r", + "Epoch 0: 6%| | 854/14932 [01:24<23:09, 10.13it/s, v_num=o30c, train/loss=3.700" ] }, { @@ -7921,7 +8929,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 53%[=========> ] 879.72M 3.75MB/s eta 3m 17s " + "Epoch 0: 6%| | 855/14932 [01:24<23:10, 10.12it/s, v_num=o30c, train/loss=3.700\r", + "Epoch 0: 6%| | 855/14932 [01:24<23:10, 10.12it/s, v_num=o30c, train/loss=3.120" ] }, { @@ -7929,7 +8938,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 53%[=========> ] 880.68M 3.71MB/s eta 3m 17s " + "Epoch 0: 6%| | 856/14932 [01:24<23:10, 10.12it/s, v_num=o30c, train/loss=3.120\r", + "Epoch 0: 6%| | 856/14932 [01:24<23:10, 10.12it/s, v_num=o30c, train/loss=2.730" ] }, { @@ -7937,7 +8947,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 53%[=========> ] 881.63M 3.84MB/s eta 3m 16s " + "Epoch 0: 6%| | 857/14932 [01:24<23:09, 10.13it/s, v_num=o30c, train/loss=2.730\r", + "Epoch 0: 6%| | 857/14932 [01:24<23:09, 10.13it/s, v_num=o30c, train/loss=1.780" ] }, { @@ -7945,7 +8956,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 53%[=========> ] 882.58M 3.48MB/s eta 3m 16s " + "Epoch 0: 6%| | 858/14932 [01:24<23:09, 10.13it/s, v_num=o30c, train/loss=1.780\r", + "Epoch 0: 6%| | 858/14932 [01:24<23:09, 10.13it/s, v_num=o30c, train/loss=1.110" ] }, { @@ -7953,7 +8965,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 53%[=========> ] 884.19M 3.56MB/s eta 3m 16s " + "Epoch 0: 6%| | 859/14932 [01:24<23:09, 10.13it/s, v_num=o30c, train/loss=1.110\r", + "Epoch 0: 6%| | 859/14932 [01:24<23:09, 10.13it/s, v_num=o30c, train/loss=3.330" ] }, { @@ -7961,7 +8974,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 53%[=========> ] 885.36M 3.60MB/s eta 3m 15s " + "Epoch 0: 6%| | 860/14932 [01:24<23:08, 10.13it/s, v_num=o30c, train/loss=3.330\r", + "Epoch 0: 6%| | 860/14932 [01:24<23:08, 10.13it/s, v_num=o30c, train/loss=4.220" ] }, { @@ -7969,7 +8983,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 53%[=========> ] 885.85M 3.49MB/s eta 3m 15s " + "Epoch 0: 6%| | 861/14932 [01:24<23:08, 10.14it/s, v_num=o30c, train/loss=4.220\r", + "Epoch 0: 6%| | 861/14932 [01:24<23:08, 10.14it/s, v_num=o30c, train/loss=1.650" ] }, { @@ -7977,7 +8992,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 53%[=========> ] 886.35M 3.43MB/s eta 3m 15s " + "Epoch 0: 6%| | 862/14932 [01:25<23:07, 10.14it/s, v_num=o30c, train/loss=1.650\r", + "Epoch 0: 6%| | 862/14932 [01:25<23:07, 10.14it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -7985,7 +9001,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 54%[=========> ] 886.85M 3.32MB/s eta 3m 15s " + "Epoch 0: 6%| | 863/14932 [01:25<23:07, 10.14it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 6%| | 863/14932 [01:25<23:07, 10.14it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -7993,7 +9010,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 54%[=========> ] 887.36M 3.32MB/s eta 3m 15s " + "Epoch 0: 6%| | 864/14932 [01:25<23:08, 10.13it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 6%| | 864/14932 [01:25<23:08, 10.13it/s, v_num=o30c, train/loss=2.620" ] }, { @@ -8001,7 +9019,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 54%[=========> ] 887.86M 3.22MB/s eta 3m 15s " + "Epoch 0: 6%| | 865/14932 [01:25<23:07, 10.14it/s, v_num=o30c, train/loss=2.620\r", + "Epoch 0: 6%| | 865/14932 [01:25<23:07, 10.14it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -8009,7 +9028,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 54%[=========> ] 888.33M 3.15MB/s eta 3m 15s " + "Epoch 0: 6%| | 866/14932 [01:25<23:07, 10.14it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 6%| | 866/14932 [01:25<23:07, 10.14it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -8017,7 +9037,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 54%[=========> ] 888.85M 3.03MB/s eta 3m 15s " + "Epoch 0: 6%| | 867/14932 [01:25<23:06, 10.14it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 6%| | 867/14932 [01:25<23:06, 10.14it/s, v_num=o30c, train/loss=2.050" ] }, { @@ -8025,7 +9046,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 54%[=========> ] 889.39M 2.91MB/s eta 3m 15s " + "Epoch 0: 6%| | 868/14932 [01:25<23:05, 10.15it/s, v_num=o30c, train/loss=2.050\r", + "Epoch 0: 6%| | 868/14932 [01:25<23:05, 10.15it/s, v_num=o30c, train/loss=4.840" ] }, { @@ -8033,7 +9055,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 54%[=========> ] 889.94M 2.83MB/s eta 3m 15s " + "Epoch 0: 6%| | 869/14932 [01:25<23:05, 10.15it/s, v_num=o30c, train/loss=4.840\r", + "Epoch 0: 6%| | 869/14932 [01:25<23:05, 10.15it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -8041,7 +9064,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 54%[=========> ] 890.50M 2.75MB/s eta 3m 15s " + "Epoch 0: 6%| | 870/14932 [01:25<23:04, 10.16it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 6%| | 870/14932 [01:25<23:04, 10.16it/s, v_num=o30c, train/loss=3.780" ] }, { @@ -8049,7 +9073,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 54%[=========> ] 890.72M 2.43MB/s eta 3m 15s " + "Epoch 0: 6%| | 871/14932 [01:25<23:06, 10.14it/s, v_num=o30c, train/loss=3.780\r", + "Epoch 0: 6%| | 871/14932 [01:25<23:06, 10.14it/s, v_num=o30c, train/loss=5.160" ] }, { @@ -8057,7 +9082,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 54%[=========> ] 891.96M 2.50MB/s eta 3m 15s " + "Epoch 0: 6%| | 872/14932 [01:25<23:05, 10.15it/s, v_num=o30c, train/loss=5.160\r", + "Epoch 0: 6%| | 872/14932 [01:25<23:05, 10.15it/s, v_num=o30c, train/loss=2.560" ] }, { @@ -8065,7 +9091,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 54%[=========> ] 892.36M 2.34MB/s eta 3m 15s " + "Epoch 0: 6%| | 873/14932 [01:25<23:04, 10.15it/s, v_num=o30c, train/loss=2.560\r", + "Epoch 0: 6%| | 873/14932 [01:26<23:04, 10.15it/s, v_num=o30c, train/loss=2.200" ] }, { @@ -8073,7 +9100,8 @@ "output_type": "stream", "text": [ "\r", - "pth 54%[=========> ] 892.79M 2.37MB/s eta 3m 15s " + "Epoch 0: 6%| | 874/14932 [01:26<23:04, 10.15it/s, v_num=o30c, train/loss=2.200\r", + "Epoch 0: 6%| | 874/14932 [01:26<23:04, 10.15it/s, v_num=o30c, train/loss=2.830" ] }, { @@ -8081,7 +9109,8 @@ "output_type": "stream", "text": [ "\r", - "th 54%[=========> ] 892.88M 1.96MB/s eta 3m 15s " + "Epoch 0: 6%| | 875/14932 [01:26<23:03, 10.16it/s, v_num=o30c, train/loss=2.830\r", + "Epoch 0: 6%| | 875/14932 [01:26<23:03, 10.16it/s, v_num=o30c, train/loss=4.310" ] }, { @@ -8089,7 +9118,8 @@ "output_type": "stream", "text": [ "\r", - "h 54%[=========> ] 893.61M 2.03MB/s eta 3m 15s " + "Epoch 0: 6%| | 876/14932 [01:26<23:03, 10.16it/s, v_num=o30c, train/loss=4.310\r", + "Epoch 0: 6%| | 876/14932 [01:26<23:03, 10.16it/s, v_num=o30c, train/loss=2.920" ] }, { @@ -8097,7 +9127,8 @@ "output_type": "stream", "text": [ "\r", - " 54%[=========> ] 893.91M 2.02MB/s eta 3m 15s " + "Epoch 0: 6%| | 877/14932 [01:26<23:03, 10.16it/s, v_num=o30c, train/loss=2.920\r", + "Epoch 0: 6%| | 877/14932 [01:26<23:03, 10.16it/s, v_num=o30c, train/loss=2.910" ] }, { @@ -8105,7 +9136,8 @@ "output_type": "stream", "text": [ "\r", - " v 54%[=========> ] 894.22M 1.93MB/s eta 3m 15s " + "Epoch 0: 6%| | 878/14932 [01:26<23:02, 10.16it/s, v_num=o30c, train/loss=2.910\r", + "Epoch 0: 6%| | 878/14932 [01:26<23:03, 10.16it/s, v_num=o30c, train/loss=2.860" ] }, { @@ -8113,7 +9145,8 @@ "output_type": "stream", "text": [ "\r", - " v5 54%[=========> ] 894.55M 1.88MB/s eta 3m 15s " + "Epoch 0: 6%| | 879/14932 [01:26<23:02, 10.17it/s, v_num=o30c, train/loss=2.860\r", + "Epoch 0: 6%| | 879/14932 [01:26<23:02, 10.17it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -8121,7 +9154,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 54%[=========> ] 894.88M 1.85MB/s eta 3m 15s " + "Epoch 0: 6%| | 880/14932 [01:26<23:01, 10.17it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 6%| | 880/14932 [01:26<23:01, 10.17it/s, v_num=o30c, train/loss=3.770" ] }, { @@ -8129,7 +9163,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 54%[=========> ] 895.21M 1.78MB/s eta 3m 15s " + "Epoch 0: 6%| | 881/14932 [01:26<23:01, 10.17it/s, v_num=o30c, train/loss=3.770\r", + "Epoch 0: 6%| | 881/14932 [01:26<23:01, 10.17it/s, v_num=o30c, train/loss=2.730" ] }, { @@ -8137,7 +9172,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 54%[=========> ] 895.55M 1.74MB/s eta 3m 15s " + "Epoch 0: 6%| | 882/14932 [01:26<23:00, 10.18it/s, v_num=o30c, train/loss=2.730\r", + "Epoch 0: 6%| | 882/14932 [01:26<23:00, 10.18it/s, v_num=o30c, train/loss=2.090" ] }, { @@ -8145,7 +9181,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 54%[=========> ] 895.89M 1.67MB/s eta 3m 15s " + "Epoch 0: 6%| | 883/14932 [01:26<23:00, 10.18it/s, v_num=o30c, train/loss=2.090\r", + "Epoch 0: 6%| | 883/14932 [01:26<23:00, 10.18it/s, v_num=o30c, train/loss=3.780" ] }, { @@ -8153,7 +9190,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 54%[=========> ] 896.24M 1.64MB/s eta 3m 15s " + "Epoch 0: 6%| | 884/14932 [01:26<22:59, 10.18it/s, v_num=o30c, train/loss=3.780\r", + "Epoch 0: 6%| | 884/14932 [01:26<22:59, 10.18it/s, v_num=o30c, train/loss=1.790" ] }, { @@ -8161,7 +9199,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 54%[=========> ] 896.58M 1.57MB/s eta 3m 15s " + "Epoch 0: 6%| | 885/14932 [01:26<22:59, 10.18it/s, v_num=o30c, train/loss=1.790\r", + "Epoch 0: 6%| | 885/14932 [01:26<22:59, 10.18it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -8169,7 +9208,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 54%[=========> ] 896.94M 1.50MB/s eta 3m 15s " + "Epoch 0: 6%| | 886/14932 [01:26<22:58, 10.19it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 6%| | 886/14932 [01:26<22:58, 10.19it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -8177,7 +9217,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 54%[=========> ] 897.30M 1.50MB/s eta 3m 15s " + "Epoch 0: 6%| | 887/14932 [01:27<22:58, 10.19it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 6%| | 887/14932 [01:27<22:58, 10.19it/s, v_num=o30c, train/loss=2.450" ] }, { @@ -8185,7 +9226,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 54%[=========> ] 897.66M 1.46MB/s eta 3m 15s " + "Epoch 0: 6%| | 888/14932 [01:27<22:57, 10.20it/s, v_num=o30c, train/loss=2.450\r", + "Epoch 0: 6%| | 888/14932 [01:27<22:57, 10.20it/s, v_num=o30c, train/loss=3.020" ] }, { @@ -8193,7 +9235,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 54%[=========> ] 898.00M 1.57MB/s eta 3m 15s " + "Epoch 0: 6%| | 889/14932 [01:27<22:58, 10.19it/s, v_num=o30c, train/loss=3.020\r", + "Epoch 0: 6%| | 889/14932 [01:27<22:58, 10.19it/s, v_num=o30c, train/loss=4.410" ] }, { @@ -8201,7 +9244,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 54%[=========> ] 898.38M 1.44MB/s eta 3m 15s " + "Epoch 0: 6%| | 890/14932 [01:27<22:57, 10.19it/s, v_num=o30c, train/loss=4.410\r", + "Epoch 0: 6%| | 890/14932 [01:27<22:57, 10.19it/s, v_num=o30c, train/loss=3.280" ] }, { @@ -8209,7 +9253,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 54%[=========> ] 898.74M 1.44MB/s eta 3m 15s " + "Epoch 0: 6%| | 891/14932 [01:27<22:57, 10.20it/s, v_num=o30c, train/loss=3.280\r", + "Epoch 0: 6%| | 891/14932 [01:27<22:57, 10.20it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -8217,7 +9262,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 54%[=========> ] 899.10M 1.48MB/s eta 3m 15s " + "Epoch 0: 6%| | 892/14932 [01:27<22:56, 10.20it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 6%| | 892/14932 [01:27<22:56, 10.20it/s, v_num=o30c, train/loss=4.030" ] }, { @@ -8225,7 +9271,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 54%[=========> ] 899.46M 1.47MB/s eta 3m 15s " + "Epoch 0: 6%| | 893/14932 [01:27<22:56, 10.20it/s, v_num=o30c, train/loss=4.030\r", + "Epoch 0: 6%| | 893/14932 [01:27<22:56, 10.20it/s, v_num=o30c, train/loss=3.340" ] }, { @@ -8233,7 +9280,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 54%[=========> ] 899.82M 1.46MB/s eta 3m 15s " + "Epoch 0: 6%| | 894/14932 [01:27<22:57, 10.19it/s, v_num=o30c, train/loss=3.340\r", + "Epoch 0: 6%| | 894/14932 [01:27<22:57, 10.19it/s, v_num=o30c, train/loss=4.810" ] }, { @@ -8241,7 +9289,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 54%[=========> ] 900.18M 1.48MB/s eta 3m 15s " + "Epoch 0: 6%| | 895/14932 [01:27<22:57, 10.19it/s, v_num=o30c, train/loss=4.810\r", + "Epoch 0: 6%| | 895/14932 [01:27<22:57, 10.19it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -8249,7 +9298,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 54%[=========> ] 900.54M 1.47MB/s eta 3m 15s " + "Epoch 0: 6%| | 896/14932 [01:27<22:58, 10.18it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 6%| | 896/14932 [01:27<22:58, 10.18it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -8257,7 +9307,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 54%[=========> ] 900.89M 1.52MB/s eta 3m 15s " + "Epoch 0: 6%| | 897/14932 [01:28<22:57, 10.19it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 6%| | 897/14932 [01:28<22:57, 10.19it/s, v_num=o30c, train/loss=2.560" ] }, { @@ -8265,7 +9316,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 54%[=========> ] 901.25M 1.50MB/s eta 3m 15s " + "Epoch 0: 6%| | 898/14932 [01:28<22:57, 10.19it/s, v_num=o30c, train/loss=2.560\r", + "Epoch 0: 6%| | 898/14932 [01:28<22:57, 10.19it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -8273,7 +9325,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 54%[=========> ] 901.58M 1.54MB/s eta 3m 15s " + "Epoch 0: 6%| | 899/14932 [01:28<22:56, 10.19it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 6%| | 899/14932 [01:28<22:56, 10.19it/s, v_num=o30c, train/loss=4.310" ] }, { @@ -8281,7 +9334,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 54%[=========> ] 901.89M 1.53MB/s eta 3m 15s " + "Epoch 0: 6%| | 900/14932 [01:28<22:56, 10.19it/s, v_num=o30c, train/loss=4.310\r", + "Epoch 0: 6%| | 900/14932 [01:28<22:56, 10.19it/s, v_num=o30c, train/loss=3.890" ] }, { @@ -8289,7 +9343,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 54%[=========> ] 902.25M 1.51MB/s eta 3m 15s " + "Epoch 0: 6%| | 901/14932 [01:28<22:56, 10.20it/s, v_num=o30c, train/loss=3.890\r", + "Epoch 0: 6%| | 901/14932 [01:28<22:56, 10.20it/s, v_num=o30c, train/loss=2.310" ] }, { @@ -8297,7 +9352,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 54%[=========> ] 902.63M 1.54MB/s eta 3m 15s " + "Epoch 0: 6%| | 902/14932 [01:28<22:55, 10.20it/s, v_num=o30c, train/loss=2.310\r", + "Epoch 0: 6%| | 902/14932 [01:28<22:55, 10.20it/s, v_num=o30c, train/loss=4.750" ] }, { @@ -8305,7 +9361,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 55%[==========> ] 902.99M 1.53MB/s eta 3m 16s " + "Epoch 0: 6%| | 903/14932 [01:28<22:54, 10.20it/s, v_num=o30c, train/loss=4.750\r", + "Epoch 0: 6%| | 903/14932 [01:28<22:54, 10.20it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -8313,7 +9370,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 55%[==========> ] 903.35M 1.56MB/s eta 3m 16s " + "Epoch 0: 6%| | 904/14932 [01:28<22:54, 10.21it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 6%| | 904/14932 [01:28<22:54, 10.21it/s, v_num=o30c, train/loss=2.590" ] }, { @@ -8321,7 +9379,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 55%[==========> ] 903.72M 1.54MB/s eta 3m 16s " + "Epoch 0: 6%| | 905/14932 [01:28<22:54, 10.21it/s, v_num=o30c, train/loss=2.590\r", + "Epoch 0: 6%| | 905/14932 [01:28<22:54, 10.21it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -8329,7 +9388,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 55%[==========> ] 904.08M 1.54MB/s eta 3m 16s " + "Epoch 0: 6%| | 906/14932 [01:28<22:53, 10.21it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 6%| | 906/14932 [01:28<22:53, 10.21it/s, v_num=o30c, train/loss=3.970" ] }, { @@ -8337,7 +9397,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 55%[==========> ] 904.46M 1.56MB/s eta 3m 16s " + "Epoch 0: 6%| | 907/14932 [01:28<22:53, 10.21it/s, v_num=o30c, train/loss=3.970\r", + "Epoch 0: 6%| | 907/14932 [01:28<22:53, 10.21it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -8345,7 +9406,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 55%[==========> ] 904.82M 1.54MB/s eta 3m 16s " + "Epoch 0: 6%| | 908/14932 [01:28<22:52, 10.22it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 6%| | 908/14932 [01:28<22:52, 10.22it/s, v_num=o30c, train/loss=3.000" ] }, { @@ -8353,7 +9415,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 55%[==========> ] 905.19M 1.58MB/s eta 3m 16s " + "Epoch 0: 6%| | 909/14932 [01:28<22:52, 10.22it/s, v_num=o30c, train/loss=3.000\r", + "Epoch 0: 6%| | 909/14932 [01:28<22:52, 10.22it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -8361,7 +9424,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 55%[==========> ] 905.58M 1.58MB/s eta 3m 16s " + "Epoch 0: 6%| | 910/14932 [01:29<22:51, 10.22it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 6%| | 910/14932 [01:29<22:51, 10.22it/s, v_num=o30c, train/loss=4.560" ] }, { @@ -8369,7 +9433,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 55%[==========> ] 905.96M 1.56MB/s eta 3m 16s " + "Epoch 0: 6%| | 911/14932 [01:29<22:51, 10.22it/s, v_num=o30c, train/loss=4.560\r", + "Epoch 0: 6%| | 911/14932 [01:29<22:51, 10.22it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -8377,7 +9442,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 55%[==========> ] 906.35M 1.59MB/s eta 3m 16s " + "Epoch 0: 6%| | 912/14932 [01:29<22:52, 10.21it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 6%| | 912/14932 [01:29<22:52, 10.21it/s, v_num=o30c, train/loss=4.060" ] }, { @@ -8385,7 +9451,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 55%[==========> ] 906.74M 1.58MB/s eta 3m 16s " + "Epoch 0: 6%| | 913/14932 [01:29<22:52, 10.22it/s, v_num=o30c, train/loss=4.060\r", + "Epoch 0: 6%| | 913/14932 [01:29<22:52, 10.22it/s, v_num=o30c, train/loss=3.690" ] }, { @@ -8393,7 +9460,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 55%[==========> ] 907.13M 1.62MB/s eta 3m 16s " + "Epoch 0: 6%| | 914/14932 [01:29<22:51, 10.22it/s, v_num=o30c, train/loss=3.690\r", + "Epoch 0: 6%| | 914/14932 [01:29<22:51, 10.22it/s, v_num=o30c, train/loss=3.310" ] }, { @@ -8401,7 +9469,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 55%[==========> ] 907.54M 1.62MB/s eta 3m 16s " + "Epoch 0: 6%| | 915/14932 [01:29<22:51, 10.22it/s, v_num=o30c, train/loss=3.310\r", + "Epoch 0: 6%| | 915/14932 [01:29<22:51, 10.22it/s, v_num=o30c, train/loss=2.340" ] }, { @@ -8409,7 +9478,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 55%[==========> ] 907.94M 1.59MB/s eta 3m 16s " + "Epoch 0: 6%| | 916/14932 [01:29<22:50, 10.22it/s, v_num=o30c, train/loss=2.340\r", + "Epoch 0: 6%| | 916/14932 [01:29<22:50, 10.22it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -8417,7 +9487,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 55%[==========> ] 908.36M 1.65MB/s eta 3m 16s " + "Epoch 0: 6%| | 917/14932 [01:29<22:50, 10.23it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 6%| | 917/14932 [01:29<22:50, 10.23it/s, v_num=o30c, train/loss=1.410" ] }, { @@ -8425,7 +9496,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 55%[==========> ] 908.79M 1.65MB/s eta 3m 16s " + "Epoch 0: 6%| | 918/14932 [01:29<22:49, 10.23it/s, v_num=o30c, train/loss=1.410\r", + "Epoch 0: 6%| | 918/14932 [01:29<22:49, 10.23it/s, v_num=o30c, train/loss=4.470" ] }, { @@ -8433,7 +9505,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 55%[==========> ] 909.22M 1.70MB/s eta 3m 16s " + "Epoch 0: 6%| | 919/14932 [01:29<22:49, 10.23it/s, v_num=o30c, train/loss=4.470\r", + "Epoch 0: 6%| | 919/14932 [01:29<22:49, 10.23it/s, v_num=o30c, train/loss=4.250" ] }, { @@ -8441,7 +9514,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 55%[==========> ] 909.68M 1.70MB/s eta 3m 16s " + "Epoch 0: 6%| | 920/14932 [01:30<22:52, 10.21it/s, v_num=o30c, train/loss=4.250\r", + "Epoch 0: 6%| | 920/14932 [01:30<22:52, 10.21it/s, v_num=o30c, train/loss=3.880" ] }, { @@ -8449,7 +9523,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 55%[==========> ] 910.13M 1.70MB/s eta 3m 16s " + "Epoch 0: 6%| | 921/14932 [01:30<22:51, 10.22it/s, v_num=o30c, train/loss=3.880\r", + "Epoch 0: 6%| | 921/14932 [01:30<22:51, 10.22it/s, v_num=o30c, train/loss=3.610" ] }, { @@ -8457,7 +9532,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 55%[==========> ] 910.60M 1.75MB/s eta 3m 16s " + "Epoch 0: 6%| | 922/14932 [01:30<22:51, 10.22it/s, v_num=o30c, train/loss=3.610\r", + "Epoch 0: 6%| | 922/14932 [01:30<22:51, 10.22it/s, v_num=o30c, train/loss=3.750" ] }, { @@ -8465,7 +9541,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 55%[==========> ] 911.10M 1.79MB/s eta 3m 15s " + "Epoch 0: 6%| | 923/14932 [01:30<22:50, 10.22it/s, v_num=o30c, train/loss=3.750\r", + "Epoch 0: 6%| | 923/14932 [01:30<22:50, 10.22it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -8473,7 +9550,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 55%[==========> ] 911.60M 1.86MB/s eta 3m 15s " + "Epoch 0: 6%| | 924/14932 [01:30<22:50, 10.22it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 6%| | 924/14932 [01:30<22:50, 10.22it/s, v_num=o30c, train/loss=3.670" ] }, { @@ -8481,7 +9559,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 55%[==========> ] 912.11M 1.87MB/s eta 3m 15s " + "Epoch 0: 6%| | 925/14932 [01:30<22:49, 10.22it/s, v_num=o30c, train/loss=3.670\r", + "Epoch 0: 6%| | 925/14932 [01:30<22:49, 10.22it/s, v_num=o30c, train/loss=3.890" ] }, { @@ -8489,7 +9568,8 @@ "output_type": "stream", "text": [ "\r", - "pth 55%[==========> ] 912.64M 1.89MB/s eta 3m 15s " + "Epoch 0: 6%| | 926/14932 [01:30<22:49, 10.23it/s, v_num=o30c, train/loss=3.890\r", + "Epoch 0: 6%| | 926/14932 [01:30<22:49, 10.23it/s, v_num=o30c, train/loss=3.770" ] }, { @@ -8497,7 +9577,8 @@ "output_type": "stream", "text": [ "\r", - "th 55%[==========> ] 913.21M 1.97MB/s eta 3m 15s " + "Epoch 0: 6%| | 927/14932 [01:30<22:48, 10.23it/s, v_num=o30c, train/loss=3.770\r", + "Epoch 0: 6%| | 927/14932 [01:30<22:48, 10.23it/s, v_num=o30c, train/loss=4.030" ] }, { @@ -8505,7 +9586,8 @@ "output_type": "stream", "text": [ "\r", - "h 55%[==========> ] 913.79M 1.99MB/s eta 3m 15s " + "Epoch 0: 6%| | 928/14932 [01:30<22:50, 10.22it/s, v_num=o30c, train/loss=4.030\r", + "Epoch 0: 6%| | 928/14932 [01:30<22:50, 10.22it/s, v_num=o30c, train/loss=4.160" ] }, { @@ -8513,7 +9595,8 @@ "output_type": "stream", "text": [ "\r", - " 55%[==========> ] 914.38M 2.08MB/s eta 3m 15s " + "Epoch 0: 6%| | 929/14932 [01:30<22:49, 10.22it/s, v_num=o30c, train/loss=4.160\r", + "Epoch 0: 6%| | 929/14932 [01:30<22:49, 10.22it/s, v_num=o30c, train/loss=2.500" ] }, { @@ -8521,7 +9604,8 @@ "output_type": "stream", "text": [ "\r", - " v 55%[==========> ] 915.00M 2.12MB/s eta 3m 15s " + "Epoch 0: 6%| | 930/14932 [01:30<22:49, 10.23it/s, v_num=o30c, train/loss=2.500\r", + "Epoch 0: 6%| | 930/14932 [01:30<22:49, 10.23it/s, v_num=o30c, train/loss=4.220" ] }, { @@ -8529,7 +9613,8 @@ "output_type": "stream", "text": [ "\r", - " v5 55%[==========> ] 915.64M 2.21MB/s eta 3m 15s " + "Epoch 0: 6%| | 931/14932 [01:31<22:48, 10.23it/s, v_num=o30c, train/loss=4.220\r", + "Epoch 0: 6%| | 931/14932 [01:31<22:48, 10.23it/s, v_num=o30c, train/loss=1.840" ] }, { @@ -8537,7 +9622,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 55%[==========> ] 916.32M 2.26MB/s eta 3m 15s " + "Epoch 0: 6%| | 932/14932 [01:31<22:48, 10.23it/s, v_num=o30c, train/loss=1.840\r", + "Epoch 0: 6%| | 932/14932 [01:31<22:48, 10.23it/s, v_num=o30c, train/loss=1.380" ] }, { @@ -8545,7 +9631,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 55%[==========> ] 917.00M 2.37MB/s eta 3m 15s " + "Epoch 0: 6%| | 933/14932 [01:31<22:47, 10.23it/s, v_num=o30c, train/loss=1.380\r", + "Epoch 0: 6%| | 933/14932 [01:31<22:47, 10.23it/s, v_num=o30c, train/loss=2.530" ] }, { @@ -8553,7 +9640,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 55%[==========> ] 917.74M 2.42MB/s eta 3m 15s " + "Epoch 0: 6%| | 934/14932 [01:31<22:47, 10.24it/s, v_num=o30c, train/loss=2.530\r", + "Epoch 0: 6%| | 934/14932 [01:31<22:47, 10.24it/s, v_num=o30c, train/loss=2.860" ] }, { @@ -8561,7 +9649,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 55%[==========> ] 918.49M 2.54MB/s eta 3m 15s " + "Epoch 0: 6%| | 935/14932 [01:31<22:46, 10.24it/s, v_num=o30c, train/loss=2.860\r", + "Epoch 0: 6%| | 935/14932 [01:31<22:46, 10.24it/s, v_num=o30c, train/loss=3.550" ] }, { @@ -8569,7 +9658,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 56%[==========> ] 919.27M 2.63MB/s eta 3m 15s " + "Epoch 0: 6%| | 936/14932 [01:31<22:46, 10.24it/s, v_num=o30c, train/loss=3.550\r", + "Epoch 0: 6%| | 936/14932 [01:31<22:46, 10.24it/s, v_num=o30c, train/loss=2.780" ] }, { @@ -8577,7 +9667,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 56%[==========> ] 920.10M 2.77MB/s eta 3m 15s " + "Epoch 0: 6%| | 937/14932 [01:31<22:45, 10.25it/s, v_num=o30c, train/loss=2.780\r", + "Epoch 0: 6%| | 937/14932 [01:31<22:45, 10.25it/s, v_num=o30c, train/loss=2.050" ] }, { @@ -8585,7 +9676,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 56%[==========> ] 920.96M 2.85MB/s eta 3m 14s " + "Epoch 0: 6%| | 938/14932 [01:31<22:45, 10.25it/s, v_num=o30c, train/loss=2.050\r", + "Epoch 0: 6%| | 938/14932 [01:31<22:45, 10.25it/s, v_num=o30c, train/loss=3.920" ] }, { @@ -8593,7 +9685,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 56%[==========> ] 921.85M 3.01MB/s eta 3m 14s " + "Epoch 0: 6%| | 939/14932 [01:31<22:45, 10.25it/s, v_num=o30c, train/loss=3.920\r", + "Epoch 0: 6%| | 939/14932 [01:31<22:45, 10.25it/s, v_num=o30c, train/loss=3.450" ] }, { @@ -8601,7 +9694,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 56%[==========> ] 922.79M 3.09MB/s eta 3m 14s " + "Epoch 0: 6%| | 940/14932 [01:31<22:44, 10.25it/s, v_num=o30c, train/loss=3.450\r", + "Epoch 0: 6%| | 940/14932 [01:31<22:44, 10.25it/s, v_num=o30c, train/loss=2.880" ] }, { @@ -8609,7 +9703,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 56%[==========> ] 923.75M 3.26MB/s eta 3m 14s " + "Epoch 0: 6%| | 941/14932 [01:31<22:44, 10.26it/s, v_num=o30c, train/loss=2.880\r", + "Epoch 0: 6%| | 941/14932 [01:31<22:44, 10.26it/s, v_num=o30c, train/loss=2.750" ] }, { @@ -8617,7 +9712,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 56%[==========> ] 924.77M 3.35MB/s eta 3m 14s " + "Epoch 0: 6%| | 942/14932 [01:31<22:43, 10.26it/s, v_num=o30c, train/loss=2.750\r", + "Epoch 0: 6%| | 942/14932 [01:31<22:43, 10.26it/s, v_num=o30c, train/loss=3.480" ] }, { @@ -8625,7 +9721,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 56%[==========> ] 925.83M 3.47MB/s eta 3m 12s " + "Epoch 0: 6%| | 943/14932 [01:31<22:43, 10.26it/s, v_num=o30c, train/loss=3.480\r", + "Epoch 0: 6%| | 943/14932 [01:31<22:43, 10.26it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -8633,7 +9730,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 56%[==========> ] 926.27M 3.50MB/s eta 3m 12s " + "Epoch 0: 6%| | 944/14932 [01:31<22:42, 10.26it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 6%| | 944/14932 [01:31<22:42, 10.26it/s, v_num=o30c, train/loss=3.970" ] }, { @@ -8641,7 +9739,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 56%[==========> ] 927.27M 3.57MB/s eta 3m 12s " + "Epoch 0: 6%| | 945/14932 [01:32<22:42, 10.27it/s, v_num=o30c, train/loss=3.970\r", + "Epoch 0: 6%| | 945/14932 [01:32<22:42, 10.27it/s, v_num=o30c, train/loss=2.840" ] }, { @@ -8649,7 +9748,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 56%[==========> ] 928.44M 3.79MB/s eta 3m 12s " + "Epoch 0: 6%| | 946/14932 [01:32<22:41, 10.27it/s, v_num=o30c, train/loss=2.840\r", + "Epoch 0: 6%| | 946/14932 [01:32<22:41, 10.27it/s, v_num=o30c, train/loss=3.270" ] }, { @@ -8657,7 +9757,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 56%[==========> ] 929.63M 3.93MB/s eta 3m 12s " + "Epoch 0: 6%| | 947/14932 [01:32<22:41, 10.27it/s, v_num=o30c, train/loss=3.270\r", + "Epoch 0: 6%| | 947/14932 [01:32<22:41, 10.27it/s, v_num=o30c, train/loss=4.090" ] }, { @@ -8665,7 +9766,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 56%[==========> ] 930.83M 4.15MB/s eta 3m 11s " + "Epoch 0: 6%| | 948/14932 [01:32<22:40, 10.28it/s, v_num=o30c, train/loss=4.090\r", + "Epoch 0: 6%| | 948/14932 [01:32<22:40, 10.27it/s, v_num=o30c, train/loss=4.690" ] }, { @@ -8673,7 +9775,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 56%[==========> ] 932.14M 4.30MB/s eta 3m 11s " + "Epoch 0: 6%| | 949/14932 [01:32<22:40, 10.28it/s, v_num=o30c, train/loss=4.690\r", + "Epoch 0: 6%| | 949/14932 [01:32<22:40, 10.28it/s, v_num=o30c, train/loss=3.380" ] }, { @@ -8681,7 +9784,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 56%[==========> ] 933.46M 4.57MB/s eta 3m 11s " + "Epoch 0: 6%| | 950/14932 [01:32<22:40, 10.28it/s, v_num=o30c, train/loss=3.380\r", + "Epoch 0: 6%| | 950/14932 [01:32<22:40, 10.28it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -8689,7 +9793,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 56%[==========> ] 934.80M 4.68MB/s eta 3m 11s " + "Epoch 0: 6%| | 951/14932 [01:32<22:39, 10.28it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 6%| | 951/14932 [01:32<22:39, 10.28it/s, v_num=o30c, train/loss=3.200" ] }, { @@ -8697,7 +9802,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 57%[==========> ] 936.21M 4.82MB/s eta 3m 11s " + "Epoch 0: 6%| | 952/14932 [01:32<22:39, 10.29it/s, v_num=o30c, train/loss=3.200\r", + "Epoch 0: 6%| | 952/14932 [01:32<22:39, 10.29it/s, v_num=o30c, train/loss=3.920" ] }, { @@ -8705,7 +9811,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 57%[==========> ] 937.75M 5.13MB/s eta 3m 8s " + "Epoch 0: 6%| | 953/14932 [01:32<22:38, 10.29it/s, v_num=o30c, train/loss=3.920\r", + "Epoch 0: 6%| | 953/14932 [01:32<22:38, 10.29it/s, v_num=o30c, train/loss=3.190" ] }, { @@ -8713,7 +9820,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 57%[==========> ] 939.38M 5.32MB/s eta 3m 8s " + "Epoch 0: 6%| | 954/14932 [01:32<22:38, 10.29it/s, v_num=o30c, train/loss=3.190\r", + "Epoch 0: 6%| | 954/14932 [01:32<22:38, 10.29it/s, v_num=o30c, train/loss=3.980" ] }, { @@ -8721,7 +9829,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 57%[==========> ] 940.04M 5.35MB/s eta 3m 8s " + "Epoch 0: 6%| | 955/14932 [01:32<22:37, 10.29it/s, v_num=o30c, train/loss=3.980\r", + "Epoch 0: 6%| | 955/14932 [01:32<22:37, 10.29it/s, v_num=o30c, train/loss=4.340" ] }, { @@ -8729,7 +9838,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 57%[==========> ] 941.57M 5.55MB/s eta 3m 8s " + "Epoch 0: 6%| | 956/14932 [01:32<22:37, 10.30it/s, v_num=o30c, train/loss=4.340\r", + "Epoch 0: 6%| | 956/14932 [01:32<22:37, 10.30it/s, v_num=o30c, train/loss=2.980" ] }, { @@ -8737,7 +9847,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 57%[==========> ] 943.33M 5.91MB/s eta 3m 8s " + "Epoch 0: 6%| | 957/14932 [01:32<22:37, 10.30it/s, v_num=o30c, train/loss=2.980\r", + "Epoch 0: 6%| | 957/14932 [01:32<22:37, 10.30it/s, v_num=o30c, train/loss=3.860" ] }, { @@ -8745,7 +9856,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 57%[==========> ] 945.04M 6.09MB/s eta 3m 6s " + "Epoch 0: 6%| | 958/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=3.860\r", + "Epoch 0: 6%| | 958/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -8753,7 +9865,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 57%[==========> ] 946.97M 6.23MB/s eta 3m 6s " + "Epoch 0: 6%| | 959/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 6%| | 959/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=2.750" ] }, { @@ -8761,7 +9874,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 57%[==========> ] 948.97M 6.66MB/s eta 3m 6s " + "Epoch 0: 6%| | 960/14932 [01:33<22:37, 10.29it/s, v_num=o30c, train/loss=2.750\r", + "Epoch 0: 6%| | 960/14932 [01:33<22:37, 10.29it/s, v_num=o30c, train/loss=3.580" ] }, { @@ -8769,7 +9883,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 57%[==========> ] 950.96M 6.88MB/s eta 3m 6s " + "Epoch 0: 6%| | 961/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=3.580\r", + "Epoch 0: 6%| | 961/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=3.810" ] }, { @@ -8777,7 +9892,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 58%[==========> ] 952.97M 7.26MB/s eta 3m 6s " + "Epoch 0: 6%| | 962/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=3.810\r", + "Epoch 0: 6%| | 962/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=3.120" ] }, { @@ -8785,7 +9901,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 58%[==========> ] 955.11M 7.48MB/s eta 3m 2s " + "Epoch 0: 6%| | 963/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=3.120\r", + "Epoch 0: 6%| | 963/14932 [01:33<22:36, 10.30it/s, v_num=o30c, train/loss=2.390" ] }, { @@ -8793,7 +9910,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 58%[==========> ] 957.44M 8.00MB/s eta 3m 2s " + "Epoch 0: 6%| | 964/14932 [01:33<22:35, 10.30it/s, v_num=o30c, train/loss=2.390\r", + "Epoch 0: 6%| | 964/14932 [01:33<22:35, 10.30it/s, v_num=o30c, train/loss=3.420" ] }, { @@ -8801,7 +9919,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 58%[==========> ] 958.36M 7.80MB/s eta 3m 2s " + "Epoch 0: 6%| | 965/14932 [01:33<22:35, 10.31it/s, v_num=o30c, train/loss=3.420\r", + "Epoch 0: 6%| | 965/14932 [01:33<22:35, 10.31it/s, v_num=o30c, train/loss=1.610" ] }, { @@ -8809,7 +9928,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 58%[==========> ] 960.55M 7.92MB/s eta 3m 2s " + "Epoch 0: 6%| | 966/14932 [01:33<22:34, 10.31it/s, v_num=o30c, train/loss=1.610\r", + "Epoch 0: 6%| | 966/14932 [01:33<22:34, 10.31it/s, v_num=o30c, train/loss=3.590" ] }, { @@ -8817,7 +9937,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 58%[==========> ] 963.10M 8.40MB/s eta 3m 2s " + "Epoch 0: 6%| | 967/14932 [01:33<22:34, 10.31it/s, v_num=o30c, train/loss=3.590\r", + "Epoch 0: 6%| | 967/14932 [01:33<22:34, 10.31it/s, v_num=o30c, train/loss=3.250" ] }, { @@ -8825,7 +9946,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 58%[==========> ] 965.74M 8.62MB/s eta 2m 58s " + "Epoch 0: 6%| | 968/14932 [01:33<22:34, 10.31it/s, v_num=o30c, train/loss=3.250\r", + "Epoch 0: 6%| | 968/14932 [01:33<22:34, 10.31it/s, v_num=o30c, train/loss=3.410" ] }, { @@ -8833,7 +9955,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 59%[==========> ] 968.47M 9.09MB/s eta 2m 58s " + "Epoch 0: 6%| | 969/14932 [01:33<22:33, 10.32it/s, v_num=o30c, train/loss=3.410\r", + "Epoch 0: 6%| | 969/14932 [01:33<22:33, 10.32it/s, v_num=o30c, train/loss=2.550" ] }, { @@ -8841,7 +9964,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 59%[==========> ] 971.32M 9.30MB/s eta 2m 58s " + "Epoch 0: 6%| | 970/14932 [01:34<22:33, 10.32it/s, v_num=o30c, train/loss=2.550\r", + "Epoch 0: 6%| | 970/14932 [01:34<22:33, 10.32it/s, v_num=o30c, train/loss=5.030" ] }, { @@ -8849,7 +9973,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 59%[==========> ] 974.27M 9.82MB/s eta 2m 58s " + "Epoch 0: 7%| | 971/14932 [01:34<22:32, 10.32it/s, v_num=o30c, train/loss=5.030\r", + "Epoch 0: 7%| | 971/14932 [01:34<22:32, 10.32it/s, v_num=o30c, train/loss=3.620" ] }, { @@ -8857,7 +9982,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 59%[==========> ] 977.36M 10.1MB/s eta 2m 58s " + "Epoch 0: 7%| | 972/14932 [01:34<22:32, 10.32it/s, v_num=o30c, train/loss=3.620\r", + "Epoch 0: 7%| | 972/14932 [01:34<22:32, 10.32it/s, v_num=o30c, train/loss=3.780" ] }, { @@ -8865,7 +9991,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 59%[==========> ] 980.57M 10.6MB/s eta 2m 52s " + "Epoch 0: 7%| | 973/14932 [01:34<22:32, 10.32it/s, v_num=o30c, train/loss=3.780\r", + "Epoch 0: 7%| | 973/14932 [01:34<22:32, 10.32it/s, v_num=o30c, train/loss=3.970" ] }, { @@ -8873,7 +10000,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 59%[==========> ] 983.00M 10.7MB/s eta 2m 52s " + "Epoch 0: 7%| | 974/14932 [01:34<22:31, 10.33it/s, v_num=o30c, train/loss=3.970\r", + "Epoch 0: 7%| | 974/14932 [01:34<22:31, 10.33it/s, v_num=o30c, train/loss=3.080" ] }, { @@ -8881,7 +10009,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 60%[===========> ] 986.29M 11.3MB/s eta 2m 52s " + "Epoch 0: 7%| | 975/14932 [01:34<22:31, 10.33it/s, v_num=o30c, train/loss=3.080\r", + "Epoch 0: 7%| | 975/14932 [01:34<22:31, 10.33it/s, v_num=o30c, train/loss=2.830" ] }, { @@ -8889,7 +10018,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 60%[===========> ] 989.66M 11.5MB/s eta 2m 52s " + "Epoch 0: 7%| | 976/14932 [01:34<22:31, 10.33it/s, v_num=o30c, train/loss=2.830\r", + "Epoch 0: 7%| | 976/14932 [01:34<22:31, 10.33it/s, v_num=o30c, train/loss=4.440" ] }, { @@ -8897,7 +10027,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 60%[===========> ] 993.21M 12.0MB/s eta 2m 52s " + "Epoch 0: 7%| | 977/14932 [01:34<22:31, 10.33it/s, v_num=o30c, train/loss=4.440\r", + "Epoch 0: 7%| | 977/14932 [01:34<22:31, 10.33it/s, v_num=o30c, train/loss=3.340" ] }, { @@ -8905,7 +10036,8 @@ "output_type": "stream", "text": [ "\r", - "pth 60%[===========> ] 996.86M 12.4MB/s eta 2m 46s " + "Epoch 0: 7%| | 978/14932 [01:34<22:30, 10.33it/s, v_num=o30c, train/loss=3.340\r", + "Epoch 0: 7%| | 978/14932 [01:34<22:30, 10.33it/s, v_num=o30c, train/loss=1.880" ] }, { @@ -8913,7 +10045,8 @@ "output_type": "stream", "text": [ "\r", - "th 60%[===========> ] 1001M 12.9MB/s eta 2m 46s " + "Epoch 0: 7%| | 979/14932 [01:34<22:30, 10.33it/s, v_num=o30c, train/loss=1.880\r", + "Epoch 0: 7%| | 979/14932 [01:34<22:30, 10.33it/s, v_num=o30c, train/loss=3.470" ] }, { @@ -8921,7 +10054,8 @@ "output_type": "stream", "text": [ "\r", - "h 61%[===========> ] 1004M 13.0MB/s eta 2m 46s " + "Epoch 0: 7%| | 980/14932 [01:34<22:29, 10.34it/s, v_num=o30c, train/loss=3.470\r", + "Epoch 0: 7%| | 980/14932 [01:34<22:29, 10.34it/s, v_num=o30c, train/loss=4.190" ] }, { @@ -8929,7 +10063,8 @@ "output_type": "stream", "text": [ "\r", - " 61%[===========> ] 1008M 13.5MB/s eta 2m 46s " + "Epoch 0: 7%| | 981/14932 [01:34<22:29, 10.34it/s, v_num=o30c, train/loss=4.190\r", + "Epoch 0: 7%| | 981/14932 [01:34<22:29, 10.34it/s, v_num=o30c, train/loss=3.550" ] }, { @@ -8937,7 +10072,8 @@ "output_type": "stream", "text": [ "\r", - " v 61%[===========> ] 1012M 13.6MB/s eta 2m 40s " + "Epoch 0: 7%| | 982/14932 [01:34<22:29, 10.34it/s, v_num=o30c, train/loss=3.550\r", + "Epoch 0: 7%| | 982/14932 [01:34<22:29, 10.34it/s, v_num=o30c, train/loss=3.580" ] }, { @@ -8945,7 +10081,8 @@ "output_type": "stream", "text": [ "\r", - " v5 61%[===========> ] 1016M 13.8MB/s eta 2m 40s " + "Epoch 0: 7%| | 983/14932 [01:35<22:28, 10.34it/s, v_num=o30c, train/loss=3.580\r", + "Epoch 0: 7%| | 983/14932 [01:35<22:28, 10.34it/s, v_num=o30c, train/loss=2.560" ] }, { @@ -8953,7 +10090,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 62%[===========> ] 1020M 14.0MB/s eta 2m 40s " + "Epoch 0: 7%| | 984/14932 [01:35<22:28, 10.35it/s, v_num=o30c, train/loss=2.560\r", + "Epoch 0: 7%| | 984/14932 [01:35<22:28, 10.35it/s, v_num=o30c, train/loss=2.160" ] }, { @@ -8961,7 +10099,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 62%[===========> ] 1023M 14.1MB/s eta 2m 40s " + "Epoch 0: 7%| | 985/14932 [01:35<22:27, 10.35it/s, v_num=o30c, train/loss=2.160\r", + "Epoch 0: 7%| | 985/14932 [01:35<22:27, 10.35it/s, v_num=o30c, train/loss=2.620" ] }, { @@ -8969,7 +10108,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 62%[===========> ] 1.00G 14.0MB/s eta 2m 40s " + "Epoch 0: 7%| | 986/14932 [01:35<22:27, 10.35it/s, v_num=o30c, train/loss=2.620\r", + "Epoch 0: 7%| | 986/14932 [01:35<22:27, 10.35it/s, v_num=o30c, train/loss=4.120" ] }, { @@ -8977,7 +10117,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 62%[===========> ] 1.00G 14.2MB/s eta 2m 34s " + "Epoch 0: 7%| | 987/14932 [01:35<22:26, 10.35it/s, v_num=o30c, train/loss=4.120\r", + "Epoch 0: 7%| | 987/14932 [01:35<22:26, 10.35it/s, v_num=o30c, train/loss=3.080" ] }, { @@ -8985,7 +10126,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 62%[===========> ] 1.01G 14.4MB/s eta 2m 34s " + "Epoch 0: 7%| | 988/14932 [01:35<22:26, 10.36it/s, v_num=o30c, train/loss=3.080\r", + "Epoch 0: 7%| | 988/14932 [01:35<22:26, 10.36it/s, v_num=o30c, train/loss=3.750" ] }, { @@ -8993,7 +10135,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 63%[===========> ] 1.01G 14.5MB/s eta 2m 34s " + "Epoch 0: 7%| | 989/14932 [01:35<22:26, 10.36it/s, v_num=o30c, train/loss=3.750\r", + "Epoch 0: 7%| | 989/14932 [01:35<22:26, 10.36it/s, v_num=o30c, train/loss=2.360" ] }, { @@ -9001,7 +10144,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 63%[===========> ] 1.01G 13.6MB/s eta 2m 34s " + "Epoch 0: 7%| | 990/14932 [01:35<22:25, 10.36it/s, v_num=o30c, train/loss=2.360\r", + "Epoch 0: 7%| | 990/14932 [01:35<22:25, 10.36it/s, v_num=o30c, train/loss=1.910" ] }, { @@ -9009,7 +10153,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 63%[===========> ] 1.02G 13.9MB/s eta 2m 29s " + "Epoch 0: 7%| | 991/14932 [01:35<22:25, 10.36it/s, v_num=o30c, train/loss=1.910\r", + "Epoch 0: 7%| | 991/14932 [01:35<22:25, 10.36it/s, v_num=o30c, train/loss=3.700" ] }, { @@ -9017,7 +10162,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 63%[===========> ] 1.02G 13.7MB/s eta 2m 29s " + "Epoch 0: 7%| | 992/14932 [01:35<22:26, 10.35it/s, v_num=o30c, train/loss=3.700\r", + "Epoch 0: 7%| | 992/14932 [01:35<22:26, 10.35it/s, v_num=o30c, train/loss=3.140" ] }, { @@ -9025,7 +10171,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 63%[===========> ] 1.02G 13.7MB/s eta 2m 29s " + "Epoch 0: 7%| | 993/14932 [01:35<22:25, 10.36it/s, v_num=o30c, train/loss=3.140\r", + "Epoch 0: 7%| | 993/14932 [01:35<22:25, 10.36it/s, v_num=o30c, train/loss=3.360" ] }, { @@ -9033,7 +10180,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 64%[===========> ] 1.03G 13.7MB/s eta 2m 29s " + "Epoch 0: 7%| | 994/14932 [01:35<22:25, 10.36it/s, v_num=o30c, train/loss=3.360\r", + "Epoch 0: 7%| | 994/14932 [01:35<22:25, 10.36it/s, v_num=o30c, train/loss=3.830" ] }, { @@ -9041,7 +10189,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 64%[===========> ] 1.03G 13.5MB/s eta 2m 29s " + "Epoch 0: 7%| | 995/14932 [01:36<22:25, 10.36it/s, v_num=o30c, train/loss=3.830\r", + "Epoch 0: 7%| | 995/14932 [01:36<22:25, 10.36it/s, v_num=o30c, train/loss=2.830" ] }, { @@ -9049,7 +10198,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 64%[===========> ] 1.03G 13.6MB/s eta 2m 23s " + "Epoch 0: 7%| | 996/14932 [01:36<22:24, 10.36it/s, v_num=o30c, train/loss=2.830\r", + "Epoch 0: 7%| | 996/14932 [01:36<22:24, 10.36it/s, v_num=o30c, train/loss=2.610" ] }, { @@ -9057,7 +10207,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 64%[===========> ] 1.04G 13.7MB/s eta 2m 23s " + "Epoch 0: 7%| | 997/14932 [01:36<22:24, 10.37it/s, v_num=o30c, train/loss=2.610\r", + "Epoch 0: 7%| | 997/14932 [01:36<22:24, 10.37it/s, v_num=o30c, train/loss=2.730" ] }, { @@ -9065,7 +10216,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 64%[===========> ] 1.04G 13.6MB/s eta 2m 23s " + "Epoch 0: 7%| | 998/14932 [01:36<22:23, 10.37it/s, v_num=o30c, train/loss=2.730\r", + "Epoch 0: 7%| | 998/14932 [01:36<22:23, 10.37it/s, v_num=o30c, train/loss=3.950" ] }, { @@ -9073,7 +10225,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 65%[============> ] 1.04G 13.4MB/s eta 2m 23s " + "Epoch 0: 7%| | 999/14932 [01:36<22:23, 10.37it/s, v_num=o30c, train/loss=3.950\r", + "Epoch 0: 7%| | 999/14932 [01:36<22:23, 10.37it/s, v_num=o30c, train/loss=3.830" ] }, { @@ -9081,7 +10234,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 65%[============> ] 1.05G 13.4MB/s eta 2m 23s " + "Epoch 0: 7%| | 1000/14932 [01:36<22:22, 10.37it/s, v_num=o30c, train/loss=3.83\r", + "Epoch 0: 7%| | 1000/14932 [01:36<22:22, 10.37it/s, v_num=o30c, train/loss=4.44" ] }, { @@ -9089,7 +10243,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 65%[============> ] 1.05G 13.3MB/s eta 2m 18s " + "Epoch 0: 7%| | 1001/14932 [01:36<22:22, 10.38it/s, v_num=o30c, train/loss=4.44\r", + "Epoch 0: 7%| | 1001/14932 [01:36<22:22, 10.38it/s, v_num=o30c, train/loss=3.91" ] }, { @@ -9097,7 +10252,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 65%[============> ] 1.05G 13.5MB/s eta 2m 18s " + "Epoch 0: 7%| | 1002/14932 [01:36<22:22, 10.38it/s, v_num=o30c, train/loss=3.91\r", + "Epoch 0: 7%| | 1002/14932 [01:36<22:22, 10.38it/s, v_num=o30c, train/loss=3.20" ] }, { @@ -9105,7 +10261,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 65%[============> ] 1.06G 13.3MB/s eta 2m 18s " + "Epoch 0: 7%| | 1003/14932 [01:36<22:21, 10.38it/s, v_num=o30c, train/loss=3.20\r", + "Epoch 0: 7%| | 1003/14932 [01:36<22:21, 10.38it/s, v_num=o30c, train/loss=3.19" ] }, { @@ -9113,7 +10270,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 66%[============> ] 1.06G 13.3MB/s eta 2m 18s " + "Epoch 0: 7%| | 1004/14932 [01:36<22:21, 10.39it/s, v_num=o30c, train/loss=3.19\r", + "Epoch 0: 7%| | 1004/14932 [01:36<22:21, 10.39it/s, v_num=o30c, train/loss=3.22" ] }, { @@ -9121,7 +10279,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 66%[============> ] 1.06G 13.5MB/s eta 2m 18s " + "Epoch 0: 7%| | 1005/14932 [01:36<22:20, 10.39it/s, v_num=o30c, train/loss=3.22\r", + "Epoch 0: 7%| | 1005/14932 [01:36<22:20, 10.39it/s, v_num=o30c, train/loss=2.44" ] }, { @@ -9129,7 +10288,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 66%[============> ] 1.07G 13.7MB/s eta 2m 12s " + "Epoch 0: 7%| | 1006/14932 [01:36<22:21, 10.38it/s, v_num=o30c, train/loss=2.44\r", + "Epoch 0: 7%| | 1006/14932 [01:36<22:21, 10.38it/s, v_num=o30c, train/loss=3.19" ] }, { @@ -9137,7 +10297,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 66%[============> ] 1.07G 13.7MB/s eta 2m 12s " + "Epoch 0: 7%| | 1007/14932 [01:36<22:20, 10.39it/s, v_num=o30c, train/loss=3.19\r", + "Epoch 0: 7%| | 1007/14932 [01:36<22:20, 10.39it/s, v_num=o30c, train/loss=3.81" ] }, { @@ -9145,7 +10306,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 67%[============> ] 1.08G 13.6MB/s eta 2m 12s " + "Epoch 0: 7%| | 1008/14932 [01:37<22:20, 10.39it/s, v_num=o30c, train/loss=3.81\r", + "Epoch 0: 7%| | 1008/14932 [01:37<22:20, 10.39it/s, v_num=o30c, train/loss=2.41" ] }, { @@ -9153,7 +10315,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 67%[============> ] 1.08G 13.7MB/s eta 2m 12s " + "Epoch 0: 7%| | 1009/14932 [01:37<22:19, 10.39it/s, v_num=o30c, train/loss=2.41\r", + "Epoch 0: 7%| | 1009/14932 [01:37<22:19, 10.39it/s, v_num=o30c, train/loss=3.70" ] }, { @@ -9161,7 +10324,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 67%[============> ] 1.08G 14.6MB/s eta 2m 12s " + "Epoch 0: 7%| | 1010/14932 [01:37<22:19, 10.39it/s, v_num=o30c, train/loss=3.70\r", + "Epoch 0: 7%| | 1010/14932 [01:37<22:19, 10.39it/s, v_num=o30c, train/loss=2.52" ] }, { @@ -9169,7 +10333,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 67%[============> ] 1.09G 14.5MB/s eta 2m 6s " + "Epoch 0: 7%| | 1011/14932 [01:37<22:18, 10.40it/s, v_num=o30c, train/loss=2.52\r", + "Epoch 0: 7%| | 1011/14932 [01:37<22:18, 10.40it/s, v_num=o30c, train/loss=4.09" ] }, { @@ -9177,7 +10342,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 68%[============> ] 1.09G 14.7MB/s eta 2m 6s " + "Epoch 0: 7%| | 1012/14932 [01:37<22:18, 10.40it/s, v_num=o30c, train/loss=4.09\r", + "Epoch 0: 7%| | 1012/14932 [01:37<22:18, 10.40it/s, v_num=o30c, train/loss=3.36" ] }, { @@ -9185,7 +10351,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 68%[============> ] 1.09G 14.8MB/s eta 2m 6s " + "Epoch 0: 7%| | 1013/14932 [01:37<22:18, 10.40it/s, v_num=o30c, train/loss=3.36\r", + "Epoch 0: 7%| | 1013/14932 [01:37<22:18, 10.40it/s, v_num=o30c, train/loss=3.56" ] }, { @@ -9193,7 +10360,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 68%[============> ] 1.09G 13.8MB/s eta 2m 6s " + "Epoch 0: 7%| | 1014/14932 [01:37<22:17, 10.41it/s, v_num=o30c, train/loss=3.56\r", + "Epoch 0: 7%| | 1014/14932 [01:37<22:17, 10.41it/s, v_num=o30c, train/loss=3.27" ] }, { @@ -9201,7 +10369,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 68%[============> ] 1.10G 13.9MB/s eta 2m 2s " + "Epoch 0: 7%| | 1015/14932 [01:37<22:17, 10.41it/s, v_num=o30c, train/loss=3.27\r", + "Epoch 0: 7%| | 1015/14932 [01:37<22:17, 10.41it/s, v_num=o30c, train/loss=2.38" ] }, { @@ -9209,7 +10378,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 68%[============> ] 1.10G 13.8MB/s eta 2m 2s " + "Epoch 0: 7%| | 1016/14932 [01:37<22:16, 10.41it/s, v_num=o30c, train/loss=2.38\r", + "Epoch 0: 7%| | 1016/14932 [01:37<22:16, 10.41it/s, v_num=o30c, train/loss=3.23" ] }, { @@ -9217,7 +10387,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 68%[============> ] 1.10G 13.5MB/s eta 2m 2s " + "Epoch 0: 7%| | 1017/14932 [01:37<22:16, 10.41it/s, v_num=o30c, train/loss=3.23\r", + "Epoch 0: 7%| | 1017/14932 [01:37<22:16, 10.41it/s, v_num=o30c, train/loss=3.33" ] }, { @@ -9225,7 +10396,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 69%[============> ] 1.11G 13.6MB/s eta 2m 2s " + "Epoch 0: 7%| | 1018/14932 [01:37<22:15, 10.42it/s, v_num=o30c, train/loss=3.33\r", + "Epoch 0: 7%| | 1018/14932 [01:37<22:15, 10.42it/s, v_num=o30c, train/loss=4.41" ] }, { @@ -9233,7 +10405,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 69%[============> ] 1.11G 12.6MB/s eta 1m 59s " + "Epoch 0: 7%| | 1019/14932 [01:37<22:15, 10.42it/s, v_num=o30c, train/loss=4.41\r", + "Epoch 0: 7%| | 1019/14932 [01:37<22:15, 10.42it/s, v_num=o30c, train/loss=2.45" ] }, { @@ -9241,7 +10414,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 69%[============> ] 1.11G 12.1MB/s eta 1m 59s " + "Epoch 0: 7%| | 1020/14932 [01:37<22:14, 10.42it/s, v_num=o30c, train/loss=2.45\r", + "Epoch 0: 7%| | 1020/14932 [01:37<22:14, 10.42it/s, v_num=o30c, train/loss=2.94" ] }, { @@ -9249,7 +10423,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 69%[============> ] 1.11G 11.7MB/s eta 1m 59s " + "Epoch 0: 7%| | 1021/14932 [01:37<22:14, 10.42it/s, v_num=o30c, train/loss=2.94\r", + "Epoch 0: 7%| | 1021/14932 [01:37<22:14, 10.42it/s, v_num=o30c, train/loss=3.02" ] }, { @@ -9257,7 +10432,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 69%[============> ] 1.12G 11.3MB/s eta 1m 59s " + "Epoch 0: 7%| | 1022/14932 [01:38<22:14, 10.43it/s, v_num=o30c, train/loss=3.02\r", + "Epoch 0: 7%| | 1022/14932 [01:38<22:14, 10.43it/s, v_num=o30c, train/loss=2.22" ] }, { @@ -9265,7 +10441,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 69%[============> ] 1.12G 10.9MB/s eta 1m 56s " + "Epoch 0: 7%| | 1023/14932 [01:38<22:13, 10.43it/s, v_num=o30c, train/loss=2.22\r", + "Epoch 0: 7%| | 1023/14932 [01:38<22:13, 10.43it/s, v_num=o30c, train/loss=3.39" ] }, { @@ -9273,7 +10450,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 69%[============> ] 1.12G 10.5MB/s eta 1m 56s " + "Epoch 0: 7%| | 1024/14932 [01:38<22:14, 10.42it/s, v_num=o30c, train/loss=3.39\r", + "Epoch 0: 7%| | 1024/14932 [01:38<22:14, 10.42it/s, v_num=o30c, train/loss=1.48" ] }, { @@ -9281,7 +10459,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 69%[============> ] 1.12G 10.0MB/s eta 1m 56s " + "Epoch 0: 7%| | 1025/14932 [01:38<22:14, 10.42it/s, v_num=o30c, train/loss=1.48\r", + "Epoch 0: 7%| | 1025/14932 [01:38<22:14, 10.42it/s, v_num=o30c, train/loss=3.80" ] }, { @@ -9289,7 +10468,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 70%[=============> ] 1.12G 9.64MB/s eta 1m 56s " + "Epoch 0: 7%| | 1026/14932 [01:38<22:14, 10.42it/s, v_num=o30c, train/loss=3.80\r", + "Epoch 0: 7%| | 1026/14932 [01:38<22:14, 10.42it/s, v_num=o30c, train/loss=3.22" ] }, { @@ -9297,7 +10477,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 70%[=============> ] 1.12G 9.20MB/s eta 1m 56s " + "Epoch 0: 7%| | 1027/14932 [01:38<22:13, 10.43it/s, v_num=o30c, train/loss=3.22\r", + "Epoch 0: 7%| | 1027/14932 [01:38<22:13, 10.43it/s, v_num=o30c, train/loss=3.11" ] }, { @@ -9305,7 +10486,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 70%[=============> ] 1.12G 8.79MB/s eta 1m 54s " + "Epoch 0: 7%| | 1028/14932 [01:38<22:13, 10.43it/s, v_num=o30c, train/loss=3.11\r", + "Epoch 0: 7%| | 1028/14932 [01:38<22:13, 10.43it/s, v_num=o30c, train/loss=3.55" ] }, { @@ -9313,7 +10495,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 70%[=============> ] 1.13G 8.39MB/s eta 1m 54s " + "Epoch 0: 7%| | 1029/14932 [01:38<22:12, 10.43it/s, v_num=o30c, train/loss=3.55\r", + "Epoch 0: 7%| | 1029/14932 [01:38<22:12, 10.43it/s, v_num=o30c, train/loss=3.42" ] }, { @@ -9321,7 +10504,8 @@ "output_type": "stream", "text": [ "\r", - "pth 70%[=============> ] 1.13G 7.98MB/s eta 1m 54s " + "Epoch 0: 7%| | 1030/14932 [01:38<22:12, 10.43it/s, v_num=o30c, train/loss=3.42\r", + "Epoch 0: 7%| | 1030/14932 [01:38<22:12, 10.43it/s, v_num=o30c, train/loss=2.55" ] }, { @@ -9329,7 +10513,8 @@ "output_type": "stream", "text": [ "\r", - "th 70%[=============> ] 1.13G 7.58MB/s eta 1m 54s " + "Epoch 0: 7%| | 1031/14932 [01:38<22:11, 10.44it/s, v_num=o30c, train/loss=2.55\r", + "Epoch 0: 7%| | 1031/14932 [01:38<22:11, 10.44it/s, v_num=o30c, train/loss=3.45" ] }, { @@ -9337,7 +10522,8 @@ "output_type": "stream", "text": [ "\r", - "h 70%[=============> ] 1.13G 8.05MB/s eta 1m 54s " + "Epoch 0: 7%| | 1032/14932 [01:38<22:11, 10.44it/s, v_num=o30c, train/loss=3.45\r", + "Epoch 0: 7%| | 1032/14932 [01:38<22:11, 10.44it/s, v_num=o30c, train/loss=3.12" ] }, { @@ -9345,7 +10531,8 @@ "output_type": "stream", "text": [ "\r", - " 70%[=============> ] 1.13G 7.63MB/s eta 1m 52s " + "Epoch 0: 7%| | 1033/14932 [01:38<22:11, 10.44it/s, v_num=o30c, train/loss=3.12\r", + "Epoch 0: 7%| | 1033/14932 [01:38<22:11, 10.44it/s, v_num=o30c, train/loss=2.95" ] }, { @@ -9353,7 +10540,8 @@ "output_type": "stream", "text": [ "\r", - " v 70%[=============> ] 1.14G 7.39MB/s eta 1m 52s " + "Epoch 0: 7%| | 1034/14932 [01:39<22:10, 10.44it/s, v_num=o30c, train/loss=2.95\r", + "Epoch 0: 7%| | 1034/14932 [01:39<22:10, 10.44it/s, v_num=o30c, train/loss=3.67" ] }, { @@ -9361,7 +10549,8 @@ "output_type": "stream", "text": [ "\r", - " v5 70%[=============> ] 1.14G 7.30MB/s eta 1m 52s " + "Epoch 0: 7%| | 1035/14932 [01:39<22:10, 10.45it/s, v_num=o30c, train/loss=3.67\r", + "Epoch 0: 7%| | 1035/14932 [01:39<22:10, 10.45it/s, v_num=o30c, train/loss=3.36" ] }, { @@ -9369,7 +10558,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 71%[=============> ] 1.14G 7.55MB/s eta 1m 52s " + "Epoch 0: 7%| | 1036/14932 [01:39<22:10, 10.45it/s, v_num=o30c, train/loss=3.36\r", + "Epoch 0: 7%| | 1036/14932 [01:39<22:10, 10.45it/s, v_num=o30c, train/loss=2.78" ] }, { @@ -9377,7 +10567,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 71%[=============> ] 1.14G 7.13MB/s eta 1m 52s " + "Epoch 0: 7%| | 1037/14932 [01:39<22:09, 10.45it/s, v_num=o30c, train/loss=2.78\r", + "Epoch 0: 7%| | 1037/14932 [01:39<22:09, 10.45it/s, v_num=o30c, train/loss=3.61" ] }, { @@ -9385,7 +10576,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 71%[=============> ] 1.14G 7.33MB/s eta 1m 49s " + "Epoch 0: 7%| | 1038/14932 [01:39<22:09, 10.45it/s, v_num=o30c, train/loss=3.61\r", + "Epoch 0: 7%| | 1038/14932 [01:39<22:09, 10.45it/s, v_num=o30c, train/loss=2.33" ] }, { @@ -9393,7 +10585,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 71%[=============> ] 1.14G 7.35MB/s eta 1m 49s " + "Epoch 0: 7%| | 1039/14932 [01:39<22:08, 10.46it/s, v_num=o30c, train/loss=2.33\r", + "Epoch 0: 7%| | 1039/14932 [01:39<22:08, 10.46it/s, v_num=o30c, train/loss=3.34" ] }, { @@ -9401,7 +10594,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 71%[=============> ] 1.15G 7.16MB/s eta 1m 49s " + "Epoch 0: 7%| | 1040/14932 [01:39<22:08, 10.46it/s, v_num=o30c, train/loss=3.34\r", + "Epoch 0: 7%| | 1040/14932 [01:39<22:08, 10.46it/s, v_num=o30c, train/loss=3.91" ] }, { @@ -9409,7 +10603,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 71%[=============> ] 1.15G 7.22MB/s eta 1m 49s " + "Epoch 0: 7%| | 1041/14932 [01:39<22:07, 10.46it/s, v_num=o30c, train/loss=3.91\r", + "Epoch 0: 7%| | 1041/14932 [01:39<22:07, 10.46it/s, v_num=o30c, train/loss=2.92" ] }, { @@ -9417,7 +10612,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 71%[=============> ] 1.15G 7.27MB/s eta 1m 49s " + "Epoch 0: 7%| | 1042/14932 [01:39<22:07, 10.46it/s, v_num=o30c, train/loss=2.92\r", + "Epoch 0: 7%| | 1042/14932 [01:39<22:07, 10.46it/s, v_num=o30c, train/loss=3.59" ] }, { @@ -9425,7 +10621,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 71%[=============> ] 1.15G 7.32MB/s eta 1m 47s " + "Epoch 0: 7%| | 1043/14932 [01:39<22:07, 10.46it/s, v_num=o30c, train/loss=3.59" ] }, { @@ -9433,7 +10629,7 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 71%[=============> ] 1.15G 7.36MB/s eta 1m 47s " + "Epoch 0: 7%| | 1043/14932 [01:39<22:07, 10.46it/s, v_num=o30c, train/loss=3.56" ] }, { @@ -9441,7 +10637,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 72%[=============> ] 1.15G 7.41MB/s eta 1m 47s " + "Epoch 0: 7%| | 1044/14932 [01:39<22:06, 10.47it/s, v_num=o30c, train/loss=3.56\r", + "Epoch 0: 7%| | 1044/14932 [01:39<22:06, 10.47it/s, v_num=o30c, train/loss=3.14" ] }, { @@ -9449,7 +10646,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 72%[=============> ] 1.16G 7.44MB/s eta 1m 47s " + "Epoch 0: 7%| | 1045/14932 [01:39<22:06, 10.47it/s, v_num=o30c, train/loss=3.14\r", + "Epoch 0: 7%| | 1045/14932 [01:39<22:06, 10.47it/s, v_num=o30c, train/loss=2.38" ] }, { @@ -9457,7 +10655,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 72%[=============> ] 1.16G 7.48MB/s eta 1m 47s " + "Epoch 0: 7%| | 1046/14932 [01:39<22:05, 10.47it/s, v_num=o30c, train/loss=2.38\r", + "Epoch 0: 7%| | 1046/14932 [01:39<22:05, 10.47it/s, v_num=o30c, train/loss=4.31" ] }, { @@ -9465,7 +10664,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 72%[=============> ] 1.16G 7.51MB/s eta 1m 44s " + "Epoch 0: 7%| | 1047/14932 [01:39<22:05, 10.47it/s, v_num=o30c, train/loss=4.31\r", + "Epoch 0: 7%| | 1047/14932 [01:39<22:05, 10.47it/s, v_num=o30c, train/loss=2.48" ] }, { @@ -9473,7 +10673,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 72%[=============> ] 1.16G 7.54MB/s eta 1m 44s " + "Epoch 0: 7%| | 1048/14932 [01:40<22:05, 10.48it/s, v_num=o30c, train/loss=2.48\r", + "Epoch 0: 7%| | 1048/14932 [01:40<22:05, 10.48it/s, v_num=o30c, train/loss=3.91" ] }, { @@ -9481,7 +10682,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 72%[=============> ] 1.16G 7.56MB/s eta 1m 44s " + "Epoch 0: 7%| | 1049/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=3.91\r", + "Epoch 0: 7%| | 1049/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=4.12" ] }, { @@ -9489,7 +10691,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 72%[=============> ] 1.17G 7.58MB/s eta 1m 44s " + "Epoch 0: 7%| | 1050/14932 [01:40<22:05, 10.48it/s, v_num=o30c, train/loss=4.12\r", + "Epoch 0: 7%| | 1050/14932 [01:40<22:05, 10.48it/s, v_num=o30c, train/loss=3.11" ] }, { @@ -9497,7 +10700,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 72%[=============> ] 1.17G 7.60MB/s eta 1m 44s " + "Epoch 0: 7%| | 1051/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=3.11\r", + "Epoch 0: 7%| | 1051/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=3.00" ] }, { @@ -9505,7 +10709,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 72%[=============> ] 1.17G 7.61MB/s eta 1m 42s " + "Epoch 0: 7%| | 1052/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=3.00\r", + "Epoch 0: 7%| | 1052/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=3.31" ] }, { @@ -9513,7 +10718,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 73%[=============> ] 1.17G 7.63MB/s eta 1m 42s " + "Epoch 0: 7%| | 1053/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=3.31\r", + "Epoch 0: 7%| | 1053/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=4.81" ] }, { @@ -9521,7 +10727,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 73%[=============> ] 1.17G 7.64MB/s eta 1m 42s " + "Epoch 0: 7%| | 1054/14932 [01:40<22:03, 10.49it/s, v_num=o30c, train/loss=4.81\r", + "Epoch 0: 7%| | 1054/14932 [01:40<22:03, 10.48it/s, v_num=o30c, train/loss=5.22" ] }, { @@ -9529,7 +10736,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 73%[=============> ] 1.17G 7.65MB/s eta 1m 42s " + "Epoch 0: 7%| | 1055/14932 [01:40<22:03, 10.49it/s, v_num=o30c, train/loss=5.22\r", + "Epoch 0: 7%| | 1055/14932 [01:40<22:03, 10.49it/s, v_num=o30c, train/loss=2.97" ] }, { @@ -9537,7 +10745,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 73%[=============> ] 1.18G 7.66MB/s eta 1m 42s " + "Epoch 0: 7%| | 1056/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=2.97\r", + "Epoch 0: 7%| | 1056/14932 [01:40<22:04, 10.48it/s, v_num=o30c, train/loss=2.53" ] }, { @@ -9545,7 +10754,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 73%[=============> ] 1.18G 7.70MB/s eta 1m 40s " + "Epoch 0: 7%| | 1057/14932 [01:40<22:03, 10.48it/s, v_num=o30c, train/loss=2.53\r", + "Epoch 0: 7%| | 1057/14932 [01:40<22:03, 10.48it/s, v_num=o30c, train/loss=3.84" ] }, { @@ -9553,7 +10763,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 73%[=============> ] 1.18G 7.67MB/s eta 1m 40s " + "Epoch 0: 7%| | 1058/14932 [01:40<22:03, 10.48it/s, v_num=o30c, train/loss=3.84\r", + "Epoch 0: 7%| | 1058/14932 [01:40<22:03, 10.48it/s, v_num=o30c, train/loss=1.61" ] }, { @@ -9561,7 +10772,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 73%[=============> ] 1.18G 7.68MB/s eta 1m 40s " + "Epoch 0: 7%| | 1059/14932 [01:40<22:02, 10.49it/s, v_num=o30c, train/loss=1.61\r", + "Epoch 0: 7%| | 1059/14932 [01:40<22:02, 10.49it/s, v_num=o30c, train/loss=3.58" ] }, { @@ -9569,7 +10781,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 73%[=============> ] 1.18G 7.68MB/s eta 1m 40s " + "Epoch 0: 7%| | 1060/14932 [01:41<22:02, 10.49it/s, v_num=o30c, train/loss=3.58\r", + "Epoch 0: 7%| | 1060/14932 [01:41<22:02, 10.49it/s, v_num=o30c, train/loss=3.09" ] }, { @@ -9577,7 +10790,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 73%[=============> ] 1.18G 7.70MB/s eta 1m 40s " + "Epoch 0: 7%| | 1061/14932 [01:41<22:02, 10.49it/s, v_num=o30c, train/loss=3.09\r", + "Epoch 0: 7%| | 1061/14932 [01:41<22:02, 10.49it/s, v_num=o30c, train/loss=3.27" ] }, { @@ -9585,7 +10799,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 74%[=============> ] 1.19G 7.70MB/s eta 97s " + "Epoch 0: 7%| | 1062/14932 [01:41<22:01, 10.49it/s, v_num=o30c, train/loss=3.27\r", + "Epoch 0: 7%| | 1062/14932 [01:41<22:01, 10.49it/s, v_num=o30c, train/loss=2.14" ] }, { @@ -9593,7 +10808,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 74%[=============> ] 1.19G 7.71MB/s eta 97s " + "Epoch 0: 7%| | 1063/14932 [01:41<22:02, 10.49it/s, v_num=o30c, train/loss=2.14\r", + "Epoch 0: 7%| | 1063/14932 [01:41<22:02, 10.49it/s, v_num=o30c, train/loss=4.62" ] }, { @@ -9601,7 +10817,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 74%[=============> ] 1.19G 7.71MB/s eta 97s " + "Epoch 0: 7%| | 1064/14932 [01:41<22:01, 10.49it/s, v_num=o30c, train/loss=4.62\r", + "Epoch 0: 7%| | 1064/14932 [01:41<22:01, 10.49it/s, v_num=o30c, train/loss=3.70" ] }, { @@ -9609,7 +10826,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 74%[=============> ] 1.19G 7.71MB/s eta 97s " + "Epoch 0: 7%| | 1065/14932 [01:41<22:01, 10.50it/s, v_num=o30c, train/loss=3.70\r", + "Epoch 0: 7%| | 1065/14932 [01:41<22:01, 10.50it/s, v_num=o30c, train/loss=3.61" ] }, { @@ -9617,7 +10835,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 74%[=============> ] 1.19G 7.71MB/s eta 97s " + "Epoch 0: 7%| | 1066/14932 [01:41<22:01, 10.49it/s, v_num=o30c, train/loss=3.61\r", + "Epoch 0: 7%| | 1066/14932 [01:41<22:01, 10.49it/s, v_num=o30c, train/loss=2.95" ] }, { @@ -9625,7 +10844,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 74%[=============> ] 1.20G 7.72MB/s eta 95s " + "Epoch 0: 7%| | 1067/14932 [01:41<22:01, 10.49it/s, v_num=o30c, train/loss=2.95\r", + "Epoch 0: 7%| | 1067/14932 [01:41<22:01, 10.49it/s, v_num=o30c, train/loss=3.47" ] }, { @@ -9633,7 +10853,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 74%[=============> ] 1.20G 7.72MB/s eta 95s " + "Epoch 0: 7%| | 1068/14932 [01:41<22:00, 10.50it/s, v_num=o30c, train/loss=3.47\r", + "Epoch 0: 7%| | 1068/14932 [01:41<22:00, 10.50it/s, v_num=o30c, train/loss=3.81" ] }, { @@ -9641,7 +10862,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 74%[=============> ] 1.20G 7.73MB/s eta 95s " + "Epoch 0: 7%| | 1069/14932 [01:41<22:00, 10.50it/s, v_num=o30c, train/loss=3.81\r", + "Epoch 0: 7%| | 1069/14932 [01:41<22:00, 10.50it/s, v_num=o30c, train/loss=3.72" ] }, { @@ -9649,7 +10871,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 74%[=============> ] 1.20G 7.74MB/s eta 95s " + "Epoch 0: 7%| | 1070/14932 [01:41<22:00, 10.50it/s, v_num=o30c, train/loss=3.72\r", + "Epoch 0: 7%| | 1070/14932 [01:41<22:00, 10.50it/s, v_num=o30c, train/loss=4.12" ] }, { @@ -9657,7 +10880,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 75%[==============> ] 1.20G 7.75MB/s eta 95s " + "Epoch 0: 7%| | 1071/14932 [01:41<21:59, 10.50it/s, v_num=o30c, train/loss=4.12\r", + "Epoch 0: 7%| | 1071/14932 [01:41<22:00, 10.50it/s, v_num=o30c, train/loss=4.47" ] }, { @@ -9665,7 +10889,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 75%[==============> ] 1.20G 7.75MB/s eta 92s " + "Epoch 0: 7%| | 1072/14932 [01:42<21:59, 10.50it/s, v_num=o30c, train/loss=4.47\r", + "Epoch 0: 7%| | 1072/14932 [01:42<21:59, 10.50it/s, v_num=o30c, train/loss=3.47" ] }, { @@ -9673,7 +10898,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 75%[==============> ] 1.21G 7.76MB/s eta 92s " + "Epoch 0: 7%| | 1073/14932 [01:42<21:59, 10.51it/s, v_num=o30c, train/loss=3.47\r", + "Epoch 0: 7%| | 1073/14932 [01:42<21:59, 10.51it/s, v_num=o30c, train/loss=3.52" ] }, { @@ -9681,7 +10907,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 75%[==============> ] 1.21G 7.75MB/s eta 92s " + "Epoch 0: 7%| | 1074/14932 [01:42<21:58, 10.51it/s, v_num=o30c, train/loss=3.52\r", + "Epoch 0: 7%| | 1074/14932 [01:42<21:58, 10.51it/s, v_num=o30c, train/loss=2.88" ] }, { @@ -9689,7 +10916,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 75%[==============> ] 1.21G 7.59MB/s eta 92s " + "Epoch 0: 7%| | 1075/14932 [01:42<21:58, 10.51it/s, v_num=o30c, train/loss=2.88\r", + "Epoch 0: 7%| | 1075/14932 [01:42<21:58, 10.51it/s, v_num=o30c, train/loss=3.09" ] }, { @@ -9697,7 +10925,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 75%[==============> ] 1.21G 7.88MB/s eta 90s " + "Epoch 0: 7%| | 1076/14932 [01:42<21:58, 10.51it/s, v_num=o30c, train/loss=3.09\r", + "Epoch 0: 7%| | 1076/14932 [01:42<21:58, 10.51it/s, v_num=o30c, train/loss=3.39" ] }, { @@ -9705,7 +10934,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 75%[==============> ] 1.21G 7.76MB/s eta 90s " + "Epoch 0: 7%| | 1077/14932 [01:42<21:57, 10.51it/s, v_num=o30c, train/loss=3.39\r", + "Epoch 0: 7%| | 1077/14932 [01:42<21:57, 10.51it/s, v_num=o30c, train/loss=3.28" ] }, { @@ -9713,7 +10943,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 75%[==============> ] 1.21G 7.55MB/s eta 90s " + "Epoch 0: 7%| | 1078/14932 [01:42<21:57, 10.52it/s, v_num=o30c, train/loss=3.28\r", + "Epoch 0: 7%| | 1078/14932 [01:42<21:57, 10.52it/s, v_num=o30c, train/loss=3.17" ] }, { @@ -9721,7 +10952,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 75%[==============> ] 1.22G 7.38MB/s eta 90s " + "Epoch 0: 7%| | 1079/14932 [01:42<21:56, 10.52it/s, v_num=o30c, train/loss=3.17\r", + "Epoch 0: 7%| | 1079/14932 [01:42<21:56, 10.52it/s, v_num=o30c, train/loss=4.12" ] }, { @@ -9729,7 +10961,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 76%[==============> ] 1.22G 7.16MB/s eta 90s " + "Epoch 0: 7%| | 1080/14932 [01:42<21:56, 10.52it/s, v_num=o30c, train/loss=4.12\r", + "Epoch 0: 7%| | 1080/14932 [01:42<21:56, 10.52it/s, v_num=o30c, train/loss=3.80" ] }, { @@ -9737,7 +10970,8 @@ "output_type": "stream", "text": [ "\r", - "pth 76%[==============> ] 1.22G 7.43MB/s eta 89s " + "Epoch 0: 7%| | 1081/14932 [01:42<21:56, 10.52it/s, v_num=o30c, train/loss=3.80\r", + "Epoch 0: 7%| | 1081/14932 [01:42<21:56, 10.52it/s, v_num=o30c, train/loss=2.11" ] }, { @@ -9745,7 +10979,8 @@ "output_type": "stream", "text": [ "\r", - "th 76%[==============> ] 1.22G 7.16MB/s eta 89s " + "Epoch 0: 7%| | 1082/14932 [01:42<21:55, 10.53it/s, v_num=o30c, train/loss=2.11\r", + "Epoch 0: 7%| | 1082/14932 [01:42<21:55, 10.53it/s, v_num=o30c, train/loss=2.36" ] }, { @@ -9753,7 +10988,8 @@ "output_type": "stream", "text": [ "\r", - "h 76%[==============> ] 1.22G 7.14MB/s eta 89s " + "Epoch 0: 7%| | 1083/14932 [01:42<21:55, 10.53it/s, v_num=o30c, train/loss=2.36\r", + "Epoch 0: 7%| | 1083/14932 [01:42<21:55, 10.53it/s, v_num=o30c, train/loss=3.44" ] }, { @@ -9761,7 +10997,8 @@ "output_type": "stream", "text": [ "\r", - " 76%[==============> ] 1.22G 6.97MB/s eta 89s " + "Epoch 0: 7%| | 1084/14932 [01:42<21:54, 10.53it/s, v_num=o30c, train/loss=3.44\r", + "Epoch 0: 7%| | 1084/14932 [01:42<21:54, 10.53it/s, v_num=o30c, train/loss=2.94" ] }, { @@ -9769,7 +11006,8 @@ "output_type": "stream", "text": [ "\r", - " v 76%[==============> ] 1.22G 6.79MB/s eta 89s " + "Epoch 0: 7%| | 1085/14932 [01:42<21:54, 10.53it/s, v_num=o30c, train/loss=2.94\r", + "Epoch 0: 7%| | 1085/14932 [01:42<21:54, 10.53it/s, v_num=o30c, train/loss=2.27" ] }, { @@ -9777,7 +11015,8 @@ "output_type": "stream", "text": [ "\r", - " v5 76%[==============> ] 1.23G 6.64MB/s eta 87s " + "Epoch 0: 7%| | 1086/14932 [01:43<21:54, 10.54it/s, v_num=o30c, train/loss=2.27\r", + "Epoch 0: 7%| | 1086/14932 [01:43<21:54, 10.54it/s, v_num=o30c, train/loss=2.55" ] }, { @@ -9785,7 +11024,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 76%[==============> ] 1.23G 6.90MB/s eta 87s " + "Epoch 0: 7%| | 1087/14932 [01:43<21:53, 10.54it/s, v_num=o30c, train/loss=2.55\r", + "Epoch 0: 7%| | 1087/14932 [01:43<21:53, 10.54it/s, v_num=o30c, train/loss=2.72" ] }, { @@ -9793,7 +11033,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 76%[==============> ] 1.23G 6.85MB/s eta 87s " + "Epoch 0: 7%| | 1088/14932 [01:43<21:55, 10.52it/s, v_num=o30c, train/loss=2.72\r", + "Epoch 0: 7%| | 1088/14932 [01:43<21:55, 10.52it/s, v_num=o30c, train/loss=3.36" ] }, { @@ -9801,7 +11042,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 76%[==============> ] 1.23G 6.56MB/s eta 87s " + "Epoch 0: 7%| | 1089/14932 [01:43<21:54, 10.53it/s, v_num=o30c, train/loss=3.36\r", + "Epoch 0: 7%| | 1089/14932 [01:43<21:54, 10.53it/s, v_num=o30c, train/loss=3.58" ] }, { @@ -9809,7 +11051,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 76%[==============> ] 1.23G 6.39MB/s eta 87s " + "Epoch 0: 7%| | 1090/14932 [01:43<21:54, 10.53it/s, v_num=o30c, train/loss=3.58\r", + "Epoch 0: 7%| | 1090/14932 [01:43<21:54, 10.53it/s, v_num=o30c, train/loss=3.31" ] }, { @@ -9817,7 +11060,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 77%[==============> ] 1.23G 6.72MB/s eta 85s " + "Epoch 0: 7%| | 1091/14932 [01:43<21:54, 10.53it/s, v_num=o30c, train/loss=3.31\r", + "Epoch 0: 7%| | 1091/14932 [01:43<21:54, 10.53it/s, v_num=o30c, train/loss=2.86" ] }, { @@ -9825,7 +11069,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 77%[==============> ] 1.24G 6.60MB/s eta 85s " + "Epoch 0: 7%| | 1092/14932 [01:43<21:53, 10.53it/s, v_num=o30c, train/loss=2.86\r", + "Epoch 0: 7%| | 1092/14932 [01:43<21:53, 10.53it/s, v_num=o30c, train/loss=1.64" ] }, { @@ -9833,7 +11078,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 77%[==============> ] 1.24G 6.74MB/s eta 85s " + "Epoch 0: 7%| | 1093/14932 [01:43<21:53, 10.54it/s, v_num=o30c, train/loss=1.64\r", + "Epoch 0: 7%| | 1093/14932 [01:43<21:53, 10.54it/s, v_num=o30c, train/loss=3.66" ] }, { @@ -9841,7 +11087,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 77%[==============> ] 1.24G 6.93MB/s eta 85s " + "Epoch 0: 7%| | 1094/14932 [01:43<21:52, 10.54it/s, v_num=o30c, train/loss=3.66\r", + "Epoch 0: 7%| | 1094/14932 [01:43<21:52, 10.54it/s, v_num=o30c, train/loss=3.23" ] }, { @@ -9849,7 +11096,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 77%[==============> ] 1.24G 6.97MB/s eta 85s " + "Epoch 0: 7%| | 1095/14932 [01:43<21:52, 10.54it/s, v_num=o30c, train/loss=3.23\r", + "Epoch 0: 7%| | 1095/14932 [01:43<21:52, 10.54it/s, v_num=o30c, train/loss=2.78" ] }, { @@ -9857,7 +11105,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 77%[==============> ] 1.24G 6.98MB/s eta 83s " + "Epoch 0: 7%| | 1096/14932 [01:43<21:52, 10.54it/s, v_num=o30c, train/loss=2.78\r", + "Epoch 0: 7%| | 1096/14932 [01:43<21:52, 10.54it/s, v_num=o30c, train/loss=3.69" ] }, { @@ -9865,7 +11114,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 77%[==============> ] 1.25G 7.20MB/s eta 83s " + "Epoch 0: 7%| | 1097/14932 [01:44<21:51, 10.55it/s, v_num=o30c, train/loss=3.69\r", + "Epoch 0: 7%| | 1097/14932 [01:44<21:51, 10.55it/s, v_num=o30c, train/loss=2.80" ] }, { @@ -9873,7 +11123,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 77%[==============> ] 1.25G 7.22MB/s eta 83s " + "Epoch 0: 7%| | 1098/14932 [01:44<21:51, 10.55it/s, v_num=o30c, train/loss=2.80\r", + "Epoch 0: 7%| | 1098/14932 [01:44<21:51, 10.55it/s, v_num=o30c, train/loss=2.95" ] }, { @@ -9881,7 +11132,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 77%[==============> ] 1.25G 6.76MB/s eta 83s " + "Epoch 0: 7%| | 1099/14932 [01:44<21:51, 10.54it/s, v_num=o30c, train/loss=2.95\r", + "Epoch 0: 7%| | 1099/14932 [01:44<21:52, 10.54it/s, v_num=o30c, train/loss=3.08" ] }, { @@ -9889,7 +11141,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 78%[==============> ] 1.25G 7.17MB/s eta 81s " + "Epoch 0: 7%| | 1100/14932 [01:44<21:51, 10.55it/s, v_num=o30c, train/loss=3.08\r", + "Epoch 0: 7%| | 1100/14932 [01:44<21:51, 10.55it/s, v_num=o30c, train/loss=3.66" ] }, { @@ -9897,7 +11150,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 78%[==============> ] 1.25G 6.95MB/s eta 81s " + "Epoch 0: 7%| | 1101/14932 [01:44<21:51, 10.55it/s, v_num=o30c, train/loss=3.66\r", + "Epoch 0: 7%| | 1101/14932 [01:44<21:51, 10.55it/s, v_num=o30c, train/loss=3.92" ] }, { @@ -9905,7 +11159,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 78%[==============> ] 1.25G 7.04MB/s eta 81s " + "Epoch 0: 7%| | 1102/14932 [01:44<21:50, 10.55it/s, v_num=o30c, train/loss=3.92\r", + "Epoch 0: 7%| | 1102/14932 [01:44<21:50, 10.55it/s, v_num=o30c, train/loss=3.23" ] }, { @@ -9913,7 +11168,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 78%[==============> ] 1.25G 6.91MB/s eta 81s " + "Epoch 0: 7%| | 1103/14932 [01:44<21:50, 10.55it/s, v_num=o30c, train/loss=3.23\r", + "Epoch 0: 7%| | 1103/14932 [01:44<21:50, 10.55it/s, v_num=o30c, train/loss=1.77" ] }, { @@ -9921,7 +11177,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 78%[==============> ] 1.25G 6.69MB/s eta 81s " + "Epoch 0: 7%| | 1104/14932 [01:44<21:50, 10.55it/s, v_num=o30c, train/loss=1.77\r", + "Epoch 0: 7%| | 1104/14932 [01:44<21:50, 10.55it/s, v_num=o30c, train/loss=3.92" ] }, { @@ -9929,7 +11186,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 78%[==============> ] 1.26G 6.76MB/s eta 79s " + "Epoch 0: 7%| | 1105/14932 [01:44<21:49, 10.56it/s, v_num=o30c, train/loss=3.92\r", + "Epoch 0: 7%| | 1105/14932 [01:44<21:49, 10.56it/s, v_num=o30c, train/loss=4.00" ] }, { @@ -9937,7 +11195,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 78%[==============> ] 1.26G 6.60MB/s eta 79s " + "Epoch 0: 7%| | 1106/14932 [01:44<21:49, 10.56it/s, v_num=o30c, train/loss=4.00\r", + "Epoch 0: 7%| | 1106/14932 [01:44<21:49, 10.56it/s, v_num=o30c, train/loss=3.14" ] }, { @@ -9945,7 +11204,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 78%[==============> ] 1.26G 6.59MB/s eta 79s " + "Epoch 0: 7%| | 1107/14932 [01:44<21:49, 10.56it/s, v_num=o30c, train/loss=3.14\r", + "Epoch 0: 7%| | 1107/14932 [01:44<21:49, 10.56it/s, v_num=o30c, train/loss=2.81" ] }, { @@ -9953,7 +11213,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 78%[==============> ] 1.26G 6.51MB/s eta 79s " + "Epoch 0: 7%| | 1108/14932 [01:44<21:48, 10.56it/s, v_num=o30c, train/loss=2.81\r", + "Epoch 0: 7%| | 1108/14932 [01:44<21:48, 10.56it/s, v_num=o30c, train/loss=4.34" ] }, { @@ -9961,7 +11222,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 78%[==============> ] 1.26G 6.29MB/s eta 79s " + "Epoch 0: 7%| | 1109/14932 [01:44<21:48, 10.57it/s, v_num=o30c, train/loss=4.34\r", + "Epoch 0: 7%| | 1109/14932 [01:44<21:48, 10.57it/s, v_num=o30c, train/loss=4.12" ] }, { @@ -9969,7 +11231,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 78%[==============> ] 1.26G 6.35MB/s eta 78s " + "Epoch 0: 7%| | 1110/14932 [01:45<21:47, 10.57it/s, v_num=o30c, train/loss=4.12\r", + "Epoch 0: 7%| | 1110/14932 [01:45<21:47, 10.57it/s, v_num=o30c, train/loss=3.94" ] }, { @@ -9977,7 +11240,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 78%[==============> ] 1.26G 5.71MB/s eta 78s " + "Epoch 0: 7%| | 1111/14932 [01:45<21:47, 10.57it/s, v_num=o30c, train/loss=3.94\r", + "Epoch 0: 7%| | 1111/14932 [01:45<21:47, 10.57it/s, v_num=o30c, train/loss=2.95" ] }, { @@ -9985,7 +11249,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 79%[==============> ] 1.27G 6.10MB/s eta 78s " + "Epoch 0: 7%| | 1112/14932 [01:45<21:46, 10.57it/s, v_num=o30c, train/loss=2.95\r", + "Epoch 0: 7%| | 1112/14932 [01:45<21:46, 10.57it/s, v_num=o30c, train/loss=3.11" ] }, { @@ -9993,7 +11258,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 79%[==============> ] 1.27G 6.11MB/s eta 78s " + "Epoch 0: 7%| | 1113/14932 [01:45<21:46, 10.58it/s, v_num=o30c, train/loss=3.11\r", + "Epoch 0: 7%| | 1113/14932 [01:45<21:46, 10.58it/s, v_num=o30c, train/loss=3.55" ] }, { @@ -10001,7 +11267,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 79%[==============> ] 1.27G 5.80MB/s eta 78s " + "Epoch 0: 7%| | 1114/14932 [01:45<21:46, 10.58it/s, v_num=o30c, train/loss=3.55\r", + "Epoch 0: 7%| | 1114/14932 [01:45<21:46, 10.58it/s, v_num=o30c, train/loss=3.30" ] }, { @@ -10009,7 +11276,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 79%[==============> ] 1.27G 6.01MB/s eta 76s " + "Epoch 0: 7%| | 1115/14932 [01:45<21:45, 10.58it/s, v_num=o30c, train/loss=3.30\r", + "Epoch 0: 7%| | 1115/14932 [01:45<21:45, 10.58it/s, v_num=o30c, train/loss=3.52" ] }, { @@ -10017,7 +11285,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 79%[==============> ] 1.27G 5.55MB/s eta 76s " + "Epoch 0: 7%| | 1116/14932 [01:45<21:45, 10.59it/s, v_num=o30c, train/loss=3.52\r", + "Epoch 0: 7%| | 1116/14932 [01:45<21:45, 10.59it/s, v_num=o30c, train/loss=3.31" ] }, { @@ -10025,7 +11294,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 79%[==============> ] 1.27G 5.46MB/s eta 76s " + "Epoch 0: 7%| | 1117/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=3.31\r", + "Epoch 0: 7%| | 1117/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=3.70" ] }, { @@ -10033,7 +11303,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 79%[==============> ] 1.27G 5.40MB/s eta 76s " + "Epoch 0: 7%| | 1118/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=3.70\r", + "Epoch 0: 7%| | 1118/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=3.70" ] }, { @@ -10041,7 +11312,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 79%[==============> ] 1.27G 5.16MB/s eta 76s " + "Epoch 0: 7%| | 1119/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=3.70\r", + "Epoch 0: 7%| | 1119/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=2.86" ] }, { @@ -10049,7 +11321,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 79%[==============> ] 1.27G 5.23MB/s eta 75s " + "Epoch 0: 8%| | 1120/14932 [01:45<21:45, 10.58it/s, v_num=o30c, train/loss=2.86\r", + "Epoch 0: 8%| | 1120/14932 [01:45<21:45, 10.58it/s, v_num=o30c, train/loss=4.97" ] }, { @@ -10057,7 +11330,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 79%[==============> ] 1.28G 5.03MB/s eta 75s " + "Epoch 0: 8%| | 1121/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=4.97\r", + "Epoch 0: 8%| | 1121/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=1.61" ] }, { @@ -10065,7 +11339,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 79%[==============> ] 1.28G 5.18MB/s eta 75s " + "Epoch 0: 8%| | 1122/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=1.61\r", + "Epoch 0: 8%| | 1122/14932 [01:45<21:44, 10.59it/s, v_num=o30c, train/loss=3.77" ] }, { @@ -10073,7 +11348,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 79%[==============> ] 1.28G 4.98MB/s eta 75s " + "Epoch 0: 8%| | 1123/14932 [01:46<21:43, 10.59it/s, v_num=o30c, train/loss=3.77\r", + "Epoch 0: 8%| | 1123/14932 [01:46<21:44, 10.59it/s, v_num=o30c, train/loss=3.06" ] }, { @@ -10081,7 +11357,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 79%[==============> ] 1.28G 5.05MB/s eta 75s " + "Epoch 0: 8%| | 1124/14932 [01:46<21:43, 10.59it/s, v_num=o30c, train/loss=3.06\r", + "Epoch 0: 8%| | 1124/14932 [01:46<21:43, 10.59it/s, v_num=o30c, train/loss=3.39" ] }, { @@ -10089,7 +11366,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 79%[==============> ] 1.28G 4.88MB/s eta 74s " + "Epoch 0: 8%| | 1125/14932 [01:46<21:43, 10.59it/s, v_num=o30c, train/loss=3.39\r", + "Epoch 0: 8%| | 1125/14932 [01:46<21:43, 10.59it/s, v_num=o30c, train/loss=3.31" ] }, { @@ -10097,7 +11375,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 79%[==============> ] 1.28G 4.82MB/s eta 74s " + "Epoch 0: 8%| | 1126/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=3.31\r", + "Epoch 0: 8%| | 1126/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=4.50" ] }, { @@ -10105,7 +11384,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 80%[===============> ] 1.28G 4.88MB/s eta 74s " + "Epoch 0: 8%| | 1127/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=4.50\r", + "Epoch 0: 8%| | 1127/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=1.93" ] }, { @@ -10113,7 +11393,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 80%[===============> ] 1.28G 4.81MB/s eta 74s " + "Epoch 0: 8%| | 1128/14932 [01:46<21:43, 10.59it/s, v_num=o30c, train/loss=1.93\r", + "Epoch 0: 8%| | 1128/14932 [01:46<21:43, 10.59it/s, v_num=o30c, train/loss=2.94" ] }, { @@ -10121,7 +11402,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 80%[===============> ] 1.28G 4.89MB/s eta 74s " + "Epoch 0: 8%| | 1129/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=2.94\r", + "Epoch 0: 8%| | 1129/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=2.28" ] }, { @@ -10129,7 +11411,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 80%[===============> ] 1.29G 4.91MB/s eta 72s " + "Epoch 0: 8%| | 1130/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=2.28\r", + "Epoch 0: 8%| | 1130/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=3.58" ] }, { @@ -10137,7 +11420,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 80%[===============> ] 1.29G 4.97MB/s eta 72s " + "Epoch 0: 8%| | 1131/14932 [01:46<21:42, 10.59it/s, v_num=o30c, train/loss=3.58\r", + "Epoch 0: 8%| | 1131/14932 [01:46<21:42, 10.59it/s, v_num=o30c, train/loss=4.03" ] }, { @@ -10145,7 +11429,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 80%[===============> ] 1.29G 5.09MB/s eta 72s " + "Epoch 0: 8%| | 1132/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=4.03\r", + "Epoch 0: 8%| | 1132/14932 [01:46<21:42, 10.60it/s, v_num=o30c, train/loss=3.28" ] }, { @@ -10153,7 +11438,8 @@ "output_type": "stream", "text": [ "\r", - "pth 80%[===============> ] 1.29G 5.06MB/s eta 72s " + "Epoch 0: 8%| | 1133/14932 [01:46<21:41, 10.60it/s, v_num=o30c, train/loss=3.28\r", + "Epoch 0: 8%| | 1133/14932 [01:46<21:41, 10.60it/s, v_num=o30c, train/loss=4.75" ] }, { @@ -10161,7 +11447,8 @@ "output_type": "stream", "text": [ "\r", - "th 80%[===============> ] 1.29G 5.28MB/s eta 72s " + "Epoch 0: 8%| | 1134/14932 [01:46<21:41, 10.60it/s, v_num=o30c, train/loss=4.75\r", + "Epoch 0: 8%| | 1134/14932 [01:46<21:41, 10.60it/s, v_num=o30c, train/loss=2.98" ] }, { @@ -10169,7 +11456,8 @@ "output_type": "stream", "text": [ "\r", - "h 80%[===============> ] 1.29G 5.21MB/s eta 71s " + "Epoch 0: 8%| | 1135/14932 [01:47<21:42, 10.59it/s, v_num=o30c, train/loss=2.98\r", + "Epoch 0: 8%| | 1135/14932 [01:47<21:42, 10.59it/s, v_num=o30c, train/loss=4.22" ] }, { @@ -10177,7 +11465,8 @@ "output_type": "stream", "text": [ "\r", - " 80%[===============> ] 1.29G 5.17MB/s eta 71s " + "Epoch 0: 8%| | 1136/14932 [01:47<21:42, 10.59it/s, v_num=o30c, train/loss=4.22\r", + "Epoch 0: 8%| | 1136/14932 [01:47<21:42, 10.59it/s, v_num=o30c, train/loss=2.05" ] }, { @@ -10185,7 +11474,8 @@ "output_type": "stream", "text": [ "\r", - " v 80%[===============> ] 1.29G 5.27MB/s eta 71s " + "Epoch 0: 8%| | 1137/14932 [01:47<21:41, 10.60it/s, v_num=o30c, train/loss=2.05\r", + "Epoch 0: 8%| | 1137/14932 [01:47<21:41, 10.60it/s, v_num=o30c, train/loss=3.03" ] }, { @@ -10193,7 +11483,8 @@ "output_type": "stream", "text": [ "\r", - " v5 80%[===============> ] 1.29G 4.79MB/s eta 71s " + "Epoch 0: 8%| | 1138/14932 [01:47<21:41, 10.60it/s, v_num=o30c, train/loss=3.03\r", + "Epoch 0: 8%| | 1138/14932 [01:47<21:41, 10.60it/s, v_num=o30c, train/loss=3.48" ] }, { @@ -10201,7 +11492,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 80%[===============> ] 1.30G 5.16MB/s eta 70s " + "Epoch 0: 8%| | 1139/14932 [01:47<21:41, 10.60it/s, v_num=o30c, train/loss=3.48\r", + "Epoch 0: 8%| | 1139/14932 [01:47<21:41, 10.60it/s, v_num=o30c, train/loss=2.70" ] }, { @@ -10209,7 +11501,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 80%[===============> ] 1.30G 4.94MB/s eta 70s " + "Epoch 0: 8%| | 1140/14932 [01:47<21:40, 10.60it/s, v_num=o30c, train/loss=2.70\r", + "Epoch 0: 8%| | 1140/14932 [01:47<21:40, 10.60it/s, v_num=o30c, train/loss=3.78" ] }, { @@ -10217,7 +11510,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 81%[===============> ] 1.30G 4.84MB/s eta 70s " + "Epoch 0: 8%| | 1141/14932 [01:47<21:40, 10.60it/s, v_num=o30c, train/loss=3.78\r", + "Epoch 0: 8%| | 1141/14932 [01:47<21:40, 10.60it/s, v_num=o30c, train/loss=2.77" ] }, { @@ -10225,7 +11519,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 81%[===============> ] 1.30G 4.91MB/s eta 70s " + "Epoch 0: 8%| | 1142/14932 [01:47<21:40, 10.61it/s, v_num=o30c, train/loss=2.77\r", + "Epoch 0: 8%| | 1142/14932 [01:47<21:40, 10.61it/s, v_num=o30c, train/loss=3.42" ] }, { @@ -10233,7 +11528,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 81%[===============> ] 1.30G 4.68MB/s eta 70s " + "Epoch 0: 8%| | 1143/14932 [01:47<21:39, 10.61it/s, v_num=o30c, train/loss=3.42\r", + "Epoch 0: 8%| | 1143/14932 [01:47<21:39, 10.61it/s, v_num=o30c, train/loss=2.62" ] }, { @@ -10241,7 +11537,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 81%[===============> ] 1.30G 4.68MB/s eta 69s " + "Epoch 0: 8%| | 1144/14932 [01:47<21:39, 10.61it/s, v_num=o30c, train/loss=2.62\r", + "Epoch 0: 8%| | 1144/14932 [01:47<21:39, 10.61it/s, v_num=o30c, train/loss=3.14" ] }, { @@ -10249,7 +11546,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 81%[===============> ] 1.30G 4.76MB/s eta 69s " + "Epoch 0: 8%| | 1145/14932 [01:47<21:38, 10.61it/s, v_num=o30c, train/loss=3.14\r", + "Epoch 0: 8%| | 1145/14932 [01:47<21:38, 10.61it/s, v_num=o30c, train/loss=4.41" ] }, { @@ -10257,7 +11555,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 81%[===============> ] 1.30G 4.54MB/s eta 69s " + "Epoch 0: 8%| | 1146/14932 [01:47<21:38, 10.62it/s, v_num=o30c, train/loss=4.41\r", + "Epoch 0: 8%| | 1146/14932 [01:47<21:38, 10.62it/s, v_num=o30c, train/loss=1.20" ] }, { @@ -10265,7 +11564,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 81%[===============> ] 1.30G 4.52MB/s eta 69s " + "Epoch 0: 8%| | 1147/14932 [01:48<21:38, 10.62it/s, v_num=o30c, train/loss=1.20\r", + "Epoch 0: 8%| | 1147/14932 [01:48<21:38, 10.62it/s, v_num=o30c, train/loss=0.98" ] }, { @@ -10273,7 +11573,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 81%[===============> ] 1.30G 4.49MB/s eta 69s " + "Epoch 0: 8%| | 1148/14932 [01:48<21:37, 10.62it/s, v_num=o30c, train/loss=0.98\r", + "Epoch 0: 8%| | 1148/14932 [01:48<21:37, 10.62it/s, v_num=o30c, train/loss=2.78" ] }, { @@ -10281,7 +11582,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 81%[===============> ] 1.31G 4.41MB/s eta 68s " + "Epoch 0: 8%| | 1149/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=2.78\r", + "Epoch 0: 8%| | 1149/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=2.58" ] }, { @@ -10289,7 +11591,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 81%[===============> ] 1.31G 4.42MB/s eta 68s " + "Epoch 0: 8%| | 1150/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=2.58\r", + "Epoch 0: 8%| | 1150/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=3.20" ] }, { @@ -10297,7 +11600,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 81%[===============> ] 1.31G 4.29MB/s eta 68s " + "Epoch 0: 8%| | 1151/14932 [01:48<21:38, 10.62it/s, v_num=o30c, train/loss=3.20\r", + "Epoch 0: 8%| | 1151/14932 [01:48<21:38, 10.62it/s, v_num=o30c, train/loss=2.23" ] }, { @@ -10305,7 +11609,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 81%[===============> ] 1.31G 4.34MB/s eta 68s " + "Epoch 0: 8%| | 1152/14932 [01:48<21:39, 10.61it/s, v_num=o30c, train/loss=2.23\r", + "Epoch 0: 8%| | 1152/14932 [01:48<21:39, 10.61it/s, v_num=o30c, train/loss=4.25" ] }, { @@ -10313,7 +11618,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 81%[===============> ] 1.31G 4.68MB/s eta 68s " + "Epoch 0: 8%| | 1153/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=4.25\r", + "Epoch 0: 8%| | 1153/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=2.94" ] }, { @@ -10321,7 +11627,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 81%[===============> ] 1.31G 4.47MB/s eta 66s " + "Epoch 0: 8%| | 1154/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=2.94\r", + "Epoch 0: 8%| | 1154/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=2.91" ] }, { @@ -10329,7 +11636,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 81%[===============> ] 1.31G 4.56MB/s eta 66s " + "Epoch 0: 8%| | 1155/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=2.91\r", + "Epoch 0: 8%| | 1155/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=4.38" ] }, { @@ -10337,7 +11645,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 82%[===============> ] 1.31G 4.52MB/s eta 66s " + "Epoch 0: 8%| | 1156/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=4.38\r", + "Epoch 0: 8%| | 1156/14932 [01:48<21:38, 10.61it/s, v_num=o30c, train/loss=4.31" ] }, { @@ -10345,7 +11654,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 82%[===============> ] 1.32G 4.70MB/s eta 66s " + "Epoch 0: 8%| | 1157/14932 [01:49<21:38, 10.61it/s, v_num=o30c, train/loss=4.31\r", + "Epoch 0: 8%| | 1157/14932 [01:49<21:38, 10.61it/s, v_num=o30c, train/loss=2.12" ] }, { @@ -10353,7 +11663,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 82%[===============> ] 1.32G 4.76MB/s eta 66s " + "Epoch 0: 8%| | 1158/14932 [01:49<21:37, 10.61it/s, v_num=o30c, train/loss=2.12\r", + "Epoch 0: 8%| | 1158/14932 [01:49<21:37, 10.61it/s, v_num=o30c, train/loss=3.16" ] }, { @@ -10361,7 +11672,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 82%[===============> ] 1.32G 4.66MB/s eta 65s " + "Epoch 0: 8%| | 1159/14932 [01:49<21:37, 10.62it/s, v_num=o30c, train/loss=3.16\r", + "Epoch 0: 8%| | 1159/14932 [01:49<21:37, 10.62it/s, v_num=o30c, train/loss=2.80" ] }, { @@ -10369,7 +11681,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 82%[===============> ] 1.32G 4.88MB/s eta 65s " + "Epoch 0: 8%| | 1160/14932 [01:49<21:36, 10.62it/s, v_num=o30c, train/loss=2.80\r", + "Epoch 0: 8%| | 1160/14932 [01:49<21:36, 10.62it/s, v_num=o30c, train/loss=4.19" ] }, { @@ -10377,7 +11690,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 82%[===============> ] 1.32G 4.85MB/s eta 65s " + "Epoch 0: 8%| | 1161/14932 [01:49<21:36, 10.62it/s, v_num=o30c, train/loss=4.19\r", + "Epoch 0: 8%| | 1161/14932 [01:49<21:36, 10.62it/s, v_num=o30c, train/loss=3.08" ] }, { @@ -10385,7 +11699,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 82%[===============> ] 1.32G 4.97MB/s eta 65s " + "Epoch 0: 8%| | 1162/14932 [01:49<21:36, 10.62it/s, v_num=o30c, train/loss=3.08\r", + "Epoch 0: 8%| | 1162/14932 [01:49<21:36, 10.62it/s, v_num=o30c, train/loss=2.17" ] }, { @@ -10393,7 +11708,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 82%[===============> ] 1.32G 4.85MB/s eta 65s " + "Epoch 0: 8%| | 1163/14932 [01:49<21:35, 10.63it/s, v_num=o30c, train/loss=2.17\r", + "Epoch 0: 8%| | 1163/14932 [01:49<21:35, 10.63it/s, v_num=o30c, train/loss=4.09" ] }, { @@ -10401,7 +11717,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 82%[===============> ] 1.32G 5.10MB/s eta 64s " + "Epoch 0: 8%| | 1164/14932 [01:49<21:35, 10.63it/s, v_num=o30c, train/loss=4.09\r", + "Epoch 0: 8%| | 1164/14932 [01:49<21:35, 10.63it/s, v_num=o30c, train/loss=3.12" ] }, { @@ -10409,7 +11726,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 82%[===============> ] 1.33G 5.03MB/s eta 64s " + "Epoch 0: 8%| | 1165/14932 [01:49<21:34, 10.63it/s, v_num=o30c, train/loss=3.12\r", + "Epoch 0: 8%| | 1165/14932 [01:49<21:34, 10.63it/s, v_num=o30c, train/loss=3.58" ] }, { @@ -10417,7 +11735,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 82%[===============> ] 1.33G 5.16MB/s eta 64s " + "Epoch 0: 8%| | 1166/14932 [01:49<21:34, 10.63it/s, v_num=o30c, train/loss=3.58\r", + "Epoch 0: 8%| | 1166/14932 [01:49<21:34, 10.63it/s, v_num=o30c, train/loss=2.33" ] }, { @@ -10425,7 +11744,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 82%[===============> ] 1.33G 5.14MB/s eta 64s " + "Epoch 0: 8%| | 1167/14932 [01:49<21:34, 10.64it/s, v_num=o30c, train/loss=2.33\r", + "Epoch 0: 8%| | 1167/14932 [01:49<21:34, 10.64it/s, v_num=o30c, train/loss=4.66" ] }, { @@ -10433,7 +11753,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 82%[===============> ] 1.33G 5.04MB/s eta 64s " + "Epoch 0: 8%| | 1168/14932 [01:49<21:33, 10.64it/s, v_num=o30c, train/loss=4.66\r", + "Epoch 0: 8%| | 1168/14932 [01:49<21:33, 10.64it/s, v_num=o30c, train/loss=4.09" ] }, { @@ -10441,7 +11762,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 82%[===============> ] 1.33G 5.14MB/s eta 62s " + "Epoch 0: 8%| | 1169/14932 [01:49<21:33, 10.64it/s, v_num=o30c, train/loss=4.09\r", + "Epoch 0: 8%| | 1169/14932 [01:49<21:33, 10.64it/s, v_num=o30c, train/loss=3.05" ] }, { @@ -10449,7 +11771,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 83%[===============> ] 1.33G 5.09MB/s eta 62s " + "Epoch 0: 8%| | 1170/14932 [01:49<21:32, 10.64it/s, v_num=o30c, train/loss=3.05\r", + "Epoch 0: 8%| | 1170/14932 [01:49<21:32, 10.64it/s, v_num=o30c, train/loss=2.64" ] }, { @@ -10457,7 +11780,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 83%[===============> ] 1.33G 5.28MB/s eta 62s " + "Epoch 0: 8%| | 1171/14932 [01:50<21:33, 10.64it/s, v_num=o30c, train/loss=2.64\r", + "Epoch 0: 8%| | 1171/14932 [01:50<21:33, 10.64it/s, v_num=o30c, train/loss=2.98" ] }, { @@ -10465,7 +11789,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 83%[===============> ] 1.33G 5.24MB/s eta 62s " + "Epoch 0: 8%| | 1172/14932 [01:50<21:33, 10.64it/s, v_num=o30c, train/loss=2.98\r", + "Epoch 0: 8%| | 1172/14932 [01:50<21:33, 10.64it/s, v_num=o30c, train/loss=3.80" ] }, { @@ -10473,7 +11798,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 83%[===============> ] 1.33G 5.17MB/s eta 62s " + "Epoch 0: 8%| | 1173/14932 [01:50<21:34, 10.63it/s, v_num=o30c, train/loss=3.80\r", + "Epoch 0: 8%| | 1173/14932 [01:50<21:34, 10.63it/s, v_num=o30c, train/loss=3.78" ] }, { @@ -10481,7 +11807,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 83%[===============> ] 1.34G 5.26MB/s eta 61s " + "Epoch 0: 8%| | 1174/14932 [01:50<21:33, 10.64it/s, v_num=o30c, train/loss=3.78\r", + "Epoch 0: 8%| | 1174/14932 [01:50<21:33, 10.64it/s, v_num=o30c, train/loss=2.03" ] }, { @@ -10489,7 +11816,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 83%[===============> ] 1.34G 5.19MB/s eta 61s " + "Epoch 0: 8%| | 1175/14932 [01:50<21:33, 10.64it/s, v_num=o30c, train/loss=2.03\r", + "Epoch 0: 8%| | 1175/14932 [01:50<21:33, 10.64it/s, v_num=o30c, train/loss=3.64" ] }, { @@ -10497,7 +11825,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 83%[===============> ] 1.34G 5.28MB/s eta 61s " + "Epoch 0: 8%| | 1176/14932 [01:50<21:32, 10.64it/s, v_num=o30c, train/loss=3.64\r", + "Epoch 0: 8%| | 1176/14932 [01:50<21:32, 10.64it/s, v_num=o30c, train/loss=3.22" ] }, { @@ -10505,7 +11834,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 83%[===============> ] 1.34G 5.16MB/s eta 61s " + "Epoch 0: 8%| | 1177/14932 [01:50<21:32, 10.64it/s, v_num=o30c, train/loss=3.22\r", + "Epoch 0: 8%| | 1177/14932 [01:50<21:32, 10.64it/s, v_num=o30c, train/loss=1.08" ] }, { @@ -10513,7 +11843,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 83%[===============> ] 1.34G 5.29MB/s eta 61s " + "Epoch 0: 8%| | 1178/14932 [01:50<21:32, 10.65it/s, v_num=o30c, train/loss=1.08\r", + "Epoch 0: 8%| | 1178/14932 [01:50<21:32, 10.65it/s, v_num=o30c, train/loss=3.06" ] }, { @@ -10521,7 +11852,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 83%[===============> ] 1.34G 5.17MB/s eta 59s " + "Epoch 0: 8%| | 1179/14932 [01:50<21:31, 10.65it/s, v_num=o30c, train/loss=3.06\r", + "Epoch 0: 8%| | 1179/14932 [01:50<21:31, 10.65it/s, v_num=o30c, train/loss=2.38" ] }, { @@ -10529,7 +11861,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 83%[===============> ] 1.34G 5.29MB/s eta 59s " + "Epoch 0: 8%| | 1180/14932 [01:50<21:31, 10.65it/s, v_num=o30c, train/loss=2.38\r", + "Epoch 0: 8%| | 1180/14932 [01:50<21:31, 10.65it/s, v_num=o30c, train/loss=3.27" ] }, { @@ -10537,7 +11870,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 83%[===============> ] 1.34G 5.18MB/s eta 59s " + "Epoch 0: 8%| | 1181/14932 [01:50<21:30, 10.65it/s, v_num=o30c, train/loss=3.27\r", + "Epoch 0: 8%| | 1181/14932 [01:50<21:30, 10.65it/s, v_num=o30c, train/loss=3.08" ] }, { @@ -10545,7 +11879,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 83%[===============> ] 1.35G 5.35MB/s eta 59s " + "Epoch 0: 8%| | 1182/14932 [01:50<21:30, 10.65it/s, v_num=o30c, train/loss=3.08\r", + "Epoch 0: 8%| | 1182/14932 [01:50<21:30, 10.65it/s, v_num=o30c, train/loss=2.95" ] }, { @@ -10553,7 +11888,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 84%[===============> ] 1.35G 5.25MB/s eta 59s " + "Epoch 0: 8%| | 1183/14932 [01:51<21:30, 10.66it/s, v_num=o30c, train/loss=2.95\r", + "Epoch 0: 8%| | 1183/14932 [01:51<21:30, 10.66it/s, v_num=o30c, train/loss=2.39" ] }, { @@ -10561,7 +11897,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 84%[===============> ] 1.35G 5.22MB/s eta 58s " + "Epoch 0: 8%| | 1184/14932 [01:51<21:31, 10.65it/s, v_num=o30c, train/loss=2.39\r", + "Epoch 0: 8%| | 1184/14932 [01:51<21:31, 10.65it/s, v_num=o30c, train/loss=3.22" ] }, { @@ -10569,7 +11906,8 @@ "output_type": "stream", "text": [ "\r", - "pth 84%[===============> ] 1.35G 5.30MB/s eta 58s " + "Epoch 0: 8%| | 1185/14932 [01:51<21:30, 10.65it/s, v_num=o30c, train/loss=3.22\r", + "Epoch 0: 8%| | 1185/14932 [01:51<21:30, 10.65it/s, v_num=o30c, train/loss=3.05" ] }, { @@ -10577,7 +11915,8 @@ "output_type": "stream", "text": [ "\r", - "th 84%[===============> ] 1.35G 5.22MB/s eta 58s " + "Epoch 0: 8%| | 1186/14932 [01:51<21:30, 10.65it/s, v_num=o30c, train/loss=3.05\r", + "Epoch 0: 8%| | 1186/14932 [01:51<21:30, 10.65it/s, v_num=o30c, train/loss=4.03" ] }, { @@ -10585,7 +11924,8 @@ "output_type": "stream", "text": [ "\r", - "h 84%[===============> ] 1.35G 5.34MB/s eta 58s " + "Epoch 0: 8%| | 1187/14932 [01:51<21:29, 10.66it/s, v_num=o30c, train/loss=4.03\r", + "Epoch 0: 8%| | 1187/14932 [01:51<21:29, 10.66it/s, v_num=o30c, train/loss=2.73" ] }, { @@ -10593,7 +11933,8 @@ "output_type": "stream", "text": [ "\r", - " 84%[===============> ] 1.35G 5.23MB/s eta 58s " + "Epoch 0: 8%| | 1188/14932 [01:51<21:29, 10.66it/s, v_num=o30c, train/loss=2.73\r", + "Epoch 0: 8%| | 1188/14932 [01:51<21:29, 10.66it/s, v_num=o30c, train/loss=2.08" ] }, { @@ -10601,7 +11942,8 @@ "output_type": "stream", "text": [ "\r", - " v 84%[===============> ] 1.35G 5.42MB/s eta 56s " + "Epoch 0: 8%| | 1189/14932 [01:51<21:29, 10.66it/s, v_num=o30c, train/loss=2.08\r", + "Epoch 0: 8%| | 1189/14932 [01:51<21:29, 10.66it/s, v_num=o30c, train/loss=2.91" ] }, { @@ -10609,7 +11951,8 @@ "output_type": "stream", "text": [ "\r", - " v5 84%[===============> ] 1.36G 5.42MB/s eta 56s " + "Epoch 0: 8%| | 1190/14932 [01:51<21:28, 10.66it/s, v_num=o30c, train/loss=2.91\r", + "Epoch 0: 8%| | 1190/14932 [01:51<21:29, 10.66it/s, v_num=o30c, train/loss=3.02" ] }, { @@ -10617,7 +11960,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 84%[===============> ] 1.36G 5.32MB/s eta 56s " + "Epoch 0: 8%| | 1191/14932 [01:51<21:28, 10.66it/s, v_num=o30c, train/loss=3.02\r", + "Epoch 0: 8%| | 1191/14932 [01:51<21:28, 10.66it/s, v_num=o30c, train/loss=2.22" ] }, { @@ -10625,7 +11969,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 84%[===============> ] 1.36G 5.32MB/s eta 56s " + "Epoch 0: 8%| | 1192/14932 [01:51<21:28, 10.66it/s, v_num=o30c, train/loss=2.22\r", + "Epoch 0: 8%| | 1192/14932 [01:51<21:28, 10.66it/s, v_num=o30c, train/loss=4.09" ] }, { @@ -10633,7 +11978,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 84%[===============> ] 1.36G 5.28MB/s eta 56s " + "Epoch 0: 8%| | 1193/14932 [01:51<21:28, 10.67it/s, v_num=o30c, train/loss=4.09\r", + "Epoch 0: 8%| | 1193/14932 [01:51<21:28, 10.67it/s, v_num=o30c, train/loss=2.45" ] }, { @@ -10641,7 +11987,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 84%[===============> ] 1.36G 5.22MB/s eta 55s " + "Epoch 0: 8%| | 1194/14932 [01:51<21:27, 10.67it/s, v_num=o30c, train/loss=2.45\r", + "Epoch 0: 8%| | 1194/14932 [01:51<21:27, 10.67it/s, v_num=o30c, train/loss=3.14" ] }, { @@ -10649,7 +11996,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 84%[===============> ] 1.36G 5.29MB/s eta 55s " + "Epoch 0: 8%| | 1195/14932 [01:51<21:27, 10.67it/s, v_num=o30c, train/loss=3.14\r", + "Epoch 0: 8%| | 1195/14932 [01:51<21:27, 10.67it/s, v_num=o30c, train/loss=3.98" ] }, { @@ -10657,7 +12005,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 84%[===============> ] 1.36G 5.21MB/s eta 55s " + "Epoch 0: 8%| | 1196/14932 [01:52<21:27, 10.67it/s, v_num=o30c, train/loss=3.98\r", + "Epoch 0: 8%| | 1196/14932 [01:52<21:27, 10.67it/s, v_num=o30c, train/loss=3.66" ] }, { @@ -10665,7 +12014,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 85%[================> ] 1.36G 5.35MB/s eta 55s " + "Epoch 0: 8%| | 1197/14932 [01:52<21:26, 10.67it/s, v_num=o30c, train/loss=3.66\r", + "Epoch 0: 8%| | 1197/14932 [01:52<21:26, 10.67it/s, v_num=o30c, train/loss=3.52" ] }, { @@ -10673,7 +12023,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 85%[================> ] 1.36G 5.27MB/s eta 55s " + "Epoch 0: 8%| | 1198/14932 [01:52<21:26, 10.68it/s, v_num=o30c, train/loss=3.52\r", + "Epoch 0: 8%| | 1198/14932 [01:52<21:26, 10.68it/s, v_num=o30c, train/loss=2.25" ] }, { @@ -10681,7 +12032,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 85%[================> ] 1.37G 5.38MB/s eta 54s " + "Epoch 0: 8%| | 1199/14932 [01:52<21:26, 10.67it/s, v_num=o30c, train/loss=2.25\r", + "Epoch 0: 8%| | 1199/14932 [01:52<21:26, 10.67it/s, v_num=o30c, train/loss=3.67" ] }, { @@ -10689,7 +12041,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 85%[================> ] 1.37G 5.41MB/s eta 54s " + "Epoch 0: 8%| | 1200/14932 [01:52<21:26, 10.67it/s, v_num=o30c, train/loss=3.67\r", + "Epoch 0: 8%| | 1200/14932 [01:52<21:26, 10.67it/s, v_num=o30c, train/loss=3.16" ] }, { @@ -10697,7 +12050,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 85%[================> ] 1.37G 5.50MB/s eta 54s " + "Epoch 0: 8%| | 1201/14932 [01:52<21:26, 10.68it/s, v_num=o30c, train/loss=3.16\r", + "Epoch 0: 8%| | 1201/14932 [01:52<21:26, 10.68it/s, v_num=o30c, train/loss=2.78" ] }, { @@ -10705,7 +12059,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 85%[================> ] 1.37G 5.60MB/s eta 54s " + "Epoch 0: 8%| | 1202/14932 [01:52<21:25, 10.68it/s, v_num=o30c, train/loss=2.78\r", + "Epoch 0: 8%| | 1202/14932 [01:52<21:25, 10.68it/s, v_num=o30c, train/loss=3.47" ] }, { @@ -10713,7 +12068,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 85%[================> ] 1.37G 5.53MB/s eta 54s " + "Epoch 0: 8%| | 1203/14932 [01:52<21:25, 10.68it/s, v_num=o30c, train/loss=3.47\r", + "Epoch 0: 8%| | 1203/14932 [01:52<21:25, 10.68it/s, v_num=o30c, train/loss=2.72" ] }, { @@ -10721,7 +12077,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 85%[================> ] 1.37G 5.54MB/s eta 52s " + "Epoch 0: 8%| | 1204/14932 [01:52<21:25, 10.68it/s, v_num=o30c, train/loss=2.72\r", + "Epoch 0: 8%| | 1204/14932 [01:52<21:25, 10.68it/s, v_num=o30c, train/loss=2.52" ] }, { @@ -10729,7 +12086,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 85%[================> ] 1.37G 5.46MB/s eta 52s " + "Epoch 0: 8%| | 1205/14932 [01:52<21:24, 10.69it/s, v_num=o30c, train/loss=2.52\r", + "Epoch 0: 8%| | 1205/14932 [01:52<21:24, 10.69it/s, v_num=o30c, train/loss=2.88" ] }, { @@ -10737,7 +12095,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 85%[================> ] 1.37G 5.55MB/s eta 52s " + "Epoch 0: 8%| | 1206/14932 [01:52<21:24, 10.69it/s, v_num=o30c, train/loss=2.88\r", + "Epoch 0: 8%| | 1206/14932 [01:52<21:24, 10.69it/s, v_num=o30c, train/loss=4.00" ] }, { @@ -10745,7 +12104,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 85%[================> ] 1.38G 5.54MB/s eta 52s " + "Epoch 0: 8%| | 1207/14932 [01:52<21:24, 10.68it/s, v_num=o30c, train/loss=4.00\r", + "Epoch 0: 8%| | 1207/14932 [01:52<21:24, 10.68it/s, v_num=o30c, train/loss=3.16" ] }, { @@ -10753,7 +12113,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 85%[================> ] 1.38G 5.52MB/s eta 52s " + "Epoch 0: 8%| | 1208/14932 [01:53<21:24, 10.69it/s, v_num=o30c, train/loss=3.16\r", + "Epoch 0: 8%| | 1208/14932 [01:53<21:24, 10.69it/s, v_num=o30c, train/loss=2.33" ] }, { @@ -10761,7 +12122,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 85%[================> ] 1.38G 5.73MB/s eta 51s " + "Epoch 0: 8%| | 1209/14932 [01:53<21:24, 10.69it/s, v_num=o30c, train/loss=2.33\r", + "Epoch 0: 8%| | 1209/14932 [01:53<21:24, 10.69it/s, v_num=o30c, train/loss=4.12" ] }, { @@ -10769,7 +12131,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 86%[================> ] 1.38G 5.74MB/s eta 51s " + "Epoch 0: 8%| | 1210/14932 [01:53<21:23, 10.69it/s, v_num=o30c, train/loss=4.12\r", + "Epoch 0: 8%| | 1210/14932 [01:53<21:23, 10.69it/s, v_num=o30c, train/loss=2.88" ] }, { @@ -10777,7 +12140,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 86%[================> ] 1.38G 5.76MB/s eta 51s " + "Epoch 0: 8%| | 1211/14932 [01:53<21:23, 10.69it/s, v_num=o30c, train/loss=2.88\r", + "Epoch 0: 8%| | 1211/14932 [01:53<21:23, 10.69it/s, v_num=o30c, train/loss=2.28" ] }, { @@ -10785,7 +12149,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 86%[================> ] 1.38G 6.05MB/s eta 51s " + "Epoch 0: 8%| | 1212/14932 [01:53<21:23, 10.69it/s, v_num=o30c, train/loss=2.28\r", + "Epoch 0: 8%| | 1212/14932 [01:53<21:23, 10.69it/s, v_num=o30c, train/loss=3.25" ] }, { @@ -10793,7 +12158,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 86%[================> ] 1.38G 6.01MB/s eta 51s " + "Epoch 0: 8%| | 1213/14932 [01:53<21:22, 10.69it/s, v_num=o30c, train/loss=3.25\r", + "Epoch 0: 8%| | 1213/14932 [01:53<21:22, 10.69it/s, v_num=o30c, train/loss=2.72" ] }, { @@ -10801,7 +12167,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 86%[================> ] 1.38G 6.04MB/s eta 49s " + "Epoch 0: 8%| | 1214/14932 [01:53<21:22, 10.70it/s, v_num=o30c, train/loss=2.72\r", + "Epoch 0: 8%| | 1214/14932 [01:53<21:22, 10.70it/s, v_num=o30c, train/loss=2.58" ] }, { @@ -10809,7 +12176,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 86%[================> ] 1.39G 6.20MB/s eta 49s " + "Epoch 0: 8%| | 1215/14932 [01:53<21:23, 10.69it/s, v_num=o30c, train/loss=2.58\r", + "Epoch 0: 8%| | 1215/14932 [01:53<21:23, 10.69it/s, v_num=o30c, train/loss=4.06" ] }, { @@ -10817,7 +12185,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 86%[================> ] 1.39G 6.03MB/s eta 49s " + "Epoch 0: 8%| | 1216/14932 [01:53<21:24, 10.68it/s, v_num=o30c, train/loss=4.06\r", + "Epoch 0: 8%| | 1216/14932 [01:53<21:24, 10.68it/s, v_num=o30c, train/loss=3.62" ] }, { @@ -10825,7 +12194,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 86%[================> ] 1.39G 6.21MB/s eta 49s " + "Epoch 0: 8%| | 1217/14932 [01:53<21:24, 10.68it/s, v_num=o30c, train/loss=3.62\r", + "Epoch 0: 8%| | 1217/14932 [01:53<21:24, 10.68it/s, v_num=o30c, train/loss=4.22" ] }, { @@ -10833,7 +12203,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 86%[================> ] 1.39G 6.20MB/s eta 49s " + "Epoch 0: 8%| | 1218/14932 [01:54<21:23, 10.68it/s, v_num=o30c, train/loss=4.22\r", + "Epoch 0: 8%| | 1218/14932 [01:54<21:23, 10.68it/s, v_num=o30c, train/loss=3.42" ] }, { @@ -10841,7 +12212,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 86%[================> ] 1.39G 6.50MB/s eta 48s " + "Epoch 0: 8%| | 1219/14932 [01:54<21:23, 10.68it/s, v_num=o30c, train/loss=3.42\r", + "Epoch 0: 8%| | 1219/14932 [01:54<21:23, 10.68it/s, v_num=o30c, train/loss=3.45" ] }, { @@ -10849,7 +12221,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 86%[================> ] 1.39G 6.51MB/s eta 48s " + "Epoch 0: 8%| | 1220/14932 [01:54<21:23, 10.69it/s, v_num=o30c, train/loss=3.45\r", + "Epoch 0: 8%| | 1220/14932 [01:54<21:23, 10.69it/s, v_num=o30c, train/loss=2.09" ] }, { @@ -10857,7 +12230,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 87%[================> ] 1.39G 6.65MB/s eta 48s " + "Epoch 0: 8%| | 1221/14932 [01:54<21:22, 10.69it/s, v_num=o30c, train/loss=2.09\r", + "Epoch 0: 8%| | 1221/14932 [01:54<21:22, 10.69it/s, v_num=o30c, train/loss=4.25" ] }, { @@ -10865,7 +12239,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 87%[================> ] 1.40G 7.11MB/s eta 48s " + "Epoch 0: 8%| | 1222/14932 [01:54<21:22, 10.69it/s, v_num=o30c, train/loss=4.25\r", + "Epoch 0: 8%| | 1222/14932 [01:54<21:22, 10.69it/s, v_num=o30c, train/loss=2.27" ] }, { @@ -10873,7 +12248,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 87%[================> ] 1.40G 6.77MB/s eta 48s " + "Epoch 0: 8%| | 1223/14932 [01:54<21:22, 10.69it/s, v_num=o30c, train/loss=2.27\r", + "Epoch 0: 8%| | 1223/14932 [01:54<21:22, 10.69it/s, v_num=o30c, train/loss=2.81" ] }, { @@ -10881,7 +12257,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 87%[================> ] 1.40G 6.93MB/s eta 46s " + "Epoch 0: 8%| | 1224/14932 [01:54<21:22, 10.69it/s, v_num=o30c, train/loss=2.81\r", + "Epoch 0: 8%| | 1224/14932 [01:54<21:22, 10.69it/s, v_num=o30c, train/loss=2.98" ] }, { @@ -10889,7 +12266,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 87%[================> ] 1.40G 6.87MB/s eta 46s " + "Epoch 0: 8%| | 1225/14932 [01:54<21:21, 10.69it/s, v_num=o30c, train/loss=2.98\r", + "Epoch 0: 8%| | 1225/14932 [01:54<21:21, 10.69it/s, v_num=o30c, train/loss=3.45" ] }, { @@ -10897,7 +12275,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 87%[================> ] 1.40G 7.03MB/s eta 46s " + "Epoch 0: 8%| | 1226/14932 [01:54<21:21, 10.70it/s, v_num=o30c, train/loss=3.45\r", + "Epoch 0: 8%| | 1226/14932 [01:54<21:21, 10.70it/s, v_num=o30c, train/loss=2.98" ] }, { @@ -10905,7 +12284,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 87%[================> ] 1.40G 7.15MB/s eta 46s " + "Epoch 0: 8%| | 1227/14932 [01:54<21:21, 10.70it/s, v_num=o30c, train/loss=2.98\r", + "Epoch 0: 8%| | 1227/14932 [01:54<21:21, 10.70it/s, v_num=o30c, train/loss=3.20" ] }, { @@ -10913,7 +12293,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 87%[================> ] 1.41G 7.37MB/s eta 46s " + "Epoch 0: 8%| | 1228/14932 [01:54<21:20, 10.70it/s, v_num=o30c, train/loss=3.20\r", + "Epoch 0: 8%| | 1228/14932 [01:54<21:20, 10.70it/s, v_num=o30c, train/loss=3.25" ] }, { @@ -10921,7 +12302,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 87%[================> ] 1.41G 7.27MB/s eta 44s " + "Epoch 0: 8%| | 1229/14932 [01:54<21:20, 10.70it/s, v_num=o30c, train/loss=3.25\r", + "Epoch 0: 8%| | 1229/14932 [01:54<21:20, 10.70it/s, v_num=o30c, train/loss=2.66" ] }, { @@ -10929,7 +12311,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 87%[================> ] 1.41G 7.74MB/s eta 44s " + "Epoch 0: 8%| | 1230/14932 [01:54<21:20, 10.70it/s, v_num=o30c, train/loss=2.66\r", + "Epoch 0: 8%| | 1230/14932 [01:54<21:21, 10.70it/s, v_num=o30c, train/loss=3.36" ] }, { @@ -10937,7 +12320,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 88%[================> ] 1.41G 8.05MB/s eta 44s " + "Epoch 0: 8%| | 1231/14932 [01:55<21:20, 10.70it/s, v_num=o30c, train/loss=3.36\r", + "Epoch 0: 8%| | 1231/14932 [01:55<21:20, 10.70it/s, v_num=o30c, train/loss=3.27" ] }, { @@ -10945,7 +12329,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 88%[================> ] 1.41G 8.00MB/s eta 44s " + "Epoch 0: 8%| | 1232/14932 [01:55<21:20, 10.70it/s, v_num=o30c, train/loss=3.27\r", + "Epoch 0: 8%| | 1232/14932 [01:55<21:20, 10.70it/s, v_num=o30c, train/loss=2.67" ] }, { @@ -10953,7 +12338,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 88%[================> ] 1.42G 8.24MB/s eta 44s " + "Epoch 0: 8%| | 1233/14932 [01:55<21:20, 10.70it/s, v_num=o30c, train/loss=2.67\r", + "Epoch 0: 8%| | 1233/14932 [01:55<21:20, 10.70it/s, v_num=o30c, train/loss=2.66" ] }, { @@ -10961,7 +12347,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 88%[================> ] 1.42G 8.18MB/s eta 42s " + "Epoch 0: 8%| | 1234/14932 [01:55<21:19, 10.70it/s, v_num=o30c, train/loss=2.66\r", + "Epoch 0: 8%| | 1234/14932 [01:55<21:19, 10.70it/s, v_num=o30c, train/loss=4.62" ] }, { @@ -10969,7 +12356,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 88%[================> ] 1.42G 8.37MB/s eta 42s " + "Epoch 0: 8%| | 1235/14932 [01:55<21:19, 10.71it/s, v_num=o30c, train/loss=4.62\r", + "Epoch 0: 8%| | 1235/14932 [01:55<21:19, 10.71it/s, v_num=o30c, train/loss=4.06" ] }, { @@ -10977,7 +12365,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 88%[================> ] 1.42G 8.58MB/s eta 42s " + "Epoch 0: 8%| | 1236/14932 [01:55<21:19, 10.71it/s, v_num=o30c, train/loss=4.06\r", + "Epoch 0: 8%| | 1236/14932 [01:55<21:19, 10.71it/s, v_num=o30c, train/loss=1.63" ] }, { @@ -10985,7 +12374,8 @@ "output_type": "stream", "text": [ "\r", - "pth 88%[================> ] 1.42G 8.86MB/s eta 42s " + "Epoch 0: 8%| | 1237/14932 [01:55<21:18, 10.71it/s, v_num=o30c, train/loss=1.63\r", + "Epoch 0: 8%| | 1237/14932 [01:55<21:18, 10.71it/s, v_num=o30c, train/loss=1.84" ] }, { @@ -10993,7 +12383,8 @@ "output_type": "stream", "text": [ "\r", - "th 88%[================> ] 1.43G 9.26MB/s eta 42s " + "Epoch 0: 8%| | 1238/14932 [01:55<21:18, 10.71it/s, v_num=o30c, train/loss=1.84\r", + "Epoch 0: 8%| | 1238/14932 [01:55<21:18, 10.71it/s, v_num=o30c, train/loss=3.56" ] }, { @@ -11001,7 +12392,8 @@ "output_type": "stream", "text": [ "\r", - "h 89%[================> ] 1.43G 9.46MB/s eta 39s " + "Epoch 0: 8%| | 1239/14932 [01:55<21:17, 10.71it/s, v_num=o30c, train/loss=3.56\r", + "Epoch 0: 8%| | 1239/14932 [01:55<21:17, 10.71it/s, v_num=o30c, train/loss=1.18" ] }, { @@ -11009,7 +12401,8 @@ "output_type": "stream", "text": [ "\r", - " 89%[================> ] 1.43G 9.86MB/s eta 39s " + "Epoch 0: 8%| | 1240/14932 [01:55<21:17, 10.72it/s, v_num=o30c, train/loss=1.18\r", + "Epoch 0: 8%| | 1240/14932 [01:55<21:17, 10.72it/s, v_num=o30c, train/loss=3.92" ] }, { @@ -11017,7 +12410,8 @@ "output_type": "stream", "text": [ "\r", - " v 89%[================> ] 1.43G 9.82MB/s eta 39s " + "Epoch 0: 8%| | 1241/14932 [01:55<21:17, 10.72it/s, v_num=o30c, train/loss=3.92\r", + "Epoch 0: 8%| | 1241/14932 [01:55<21:17, 10.72it/s, v_num=o30c, train/loss=2.06" ] }, { @@ -11025,7 +12419,8 @@ "output_type": "stream", "text": [ "\r", - " v5 89%[================> ] 1.44G 10.0MB/s eta 39s " + "Epoch 0: 8%| | 1242/14932 [01:55<21:17, 10.72it/s, v_num=o30c, train/loss=2.06\r", + "Epoch 0: 8%| | 1242/14932 [01:55<21:17, 10.72it/s, v_num=o30c, train/loss=3.45" ] }, { @@ -11033,7 +12428,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 89%[================> ] 1.44G 10.4MB/s eta 39s " + "Epoch 0: 8%| | 1243/14932 [01:55<21:16, 10.72it/s, v_num=o30c, train/loss=3.45\r", + "Epoch 0: 8%| | 1243/14932 [01:55<21:16, 10.72it/s, v_num=o30c, train/loss=2.12" ] }, { @@ -11041,7 +12437,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 89%[================> ] 1.44G 9.95MB/s eta 36s " + "Epoch 0: 8%| | 1244/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=2.12\r", + "Epoch 0: 8%| | 1244/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=2.98" ] }, { @@ -11049,7 +12446,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 89%[================> ] 1.44G 10.5MB/s eta 36s " + "Epoch 0: 8%| | 1245/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=2.98\r", + "Epoch 0: 8%| | 1245/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=2.33" ] }, { @@ -11057,7 +12455,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 90%[=================> ] 1.44G 10.5MB/s eta 36s " + "Epoch 0: 8%| | 1246/14932 [01:56<21:15, 10.73it/s, v_num=o30c, train/loss=2.33\r", + "Epoch 0: 8%| | 1246/14932 [01:56<21:15, 10.73it/s, v_num=o30c, train/loss=1.73" ] }, { @@ -11065,7 +12464,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 90%[=================> ] 1.45G 10.6MB/s eta 36s " + "Epoch 0: 8%| | 1247/14932 [01:56<21:15, 10.73it/s, v_num=o30c, train/loss=1.73\r", + "Epoch 0: 8%| | 1247/14932 [01:56<21:15, 10.73it/s, v_num=o30c, train/loss=2.23" ] }, { @@ -11073,7 +12473,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 90%[=================> ] 1.45G 10.9MB/s eta 36s " + "Epoch 0: 8%| | 1248/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=2.23\r", + "Epoch 0: 8%| | 1248/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=3.64" ] }, { @@ -11081,7 +12482,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 90%[=================> ] 1.45G 11.3MB/s eta 33s " + "Epoch 0: 8%| | 1249/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=3.64\r", + "Epoch 0: 8%| | 1249/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=3.67" ] }, { @@ -11089,7 +12491,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 90%[=================> ] 1.46G 11.3MB/s eta 33s " + "Epoch 0: 8%| | 1250/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=3.67\r", + "Epoch 0: 8%| | 1250/14932 [01:56<21:16, 10.72it/s, v_num=o30c, train/loss=2.66" ] }, { @@ -11097,7 +12500,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 91%[=================> ] 1.46G 11.6MB/s eta 33s " + "Epoch 0: 8%| | 1251/14932 [01:56<21:15, 10.72it/s, v_num=o30c, train/loss=2.66\r", + "Epoch 0: 8%| | 1251/14932 [01:56<21:15, 10.72it/s, v_num=o30c, train/loss=2.00" ] }, { @@ -11105,7 +12509,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 91%[=================> ] 1.46G 12.0MB/s eta 33s " + "Epoch 0: 8%| | 1252/14932 [01:56<21:15, 10.73it/s, v_num=o30c, train/loss=2.00\r", + "Epoch 0: 8%| | 1252/14932 [01:56<21:15, 10.73it/s, v_num=o30c, train/loss=3.27" ] }, { @@ -11113,7 +12518,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 91%[=================> ] 1.47G 12.1MB/s eta 33s " + "Epoch 0: 8%| | 1253/14932 [01:56<21:15, 10.73it/s, v_num=o30c, train/loss=3.27\r", + "Epoch 0: 8%| | 1253/14932 [01:56<21:15, 10.73it/s, v_num=o30c, train/loss=3.44" ] }, { @@ -11121,7 +12527,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048 91%[=================> ] 1.47G 12.4MB/s eta 29s " + "Epoch 0: 8%| | 1254/14932 [01:56<21:14, 10.73it/s, v_num=o30c, train/loss=3.44\r", + "Epoch 0: 8%| | 1254/14932 [01:56<21:14, 10.73it/s, v_num=o30c, train/loss=3.47" ] }, { @@ -11129,7 +12536,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048- 92%[=================> ] 1.47G 12.8MB/s eta 29s " + "Epoch 0: 8%| | 1255/14932 [01:56<21:14, 10.73it/s, v_num=o30c, train/loss=3.47\r", + "Epoch 0: 8%| | 1255/14932 [01:56<21:14, 10.73it/s, v_num=o30c, train/loss=1.16" ] }, { @@ -11137,7 +12545,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E 92%[=================> ] 1.48G 12.9MB/s eta 29s " + "Epoch 0: 8%| | 1256/14932 [01:57<21:14, 10.73it/s, v_num=o30c, train/loss=1.16\r", + "Epoch 0: 8%| | 1256/14932 [01:57<21:14, 10.73it/s, v_num=o30c, train/loss=4.56" ] }, { @@ -11145,7 +12554,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0 92%[=================> ] 1.48G 13.2MB/s eta 29s " + "Epoch 0: 8%| | 1257/14932 [01:57<21:13, 10.73it/s, v_num=o30c, train/loss=4.56\r", + "Epoch 0: 8%| | 1257/14932 [01:57<21:13, 10.73it/s, v_num=o30c, train/loss=3.75" ] }, { @@ -11153,7 +12563,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2048-E0_ 92%[=================> ] 1.49G 12.7MB/s eta 25s " + "Epoch 0: 8%| | 1258/14932 [01:57<21:13, 10.74it/s, v_num=o30c, train/loss=3.75\r", + "Epoch 0: 8%| | 1258/14932 [01:57<21:13, 10.74it/s, v_num=o30c, train/loss=3.53" ] }, { @@ -11161,7 +12572,8 @@ "output_type": "stream", "text": [ "\r", - "v5r3-L12-D2048-E0_1 92%[=================> ] 1.49G 12.8MB/s eta 25s " + "Epoch 0: 8%| | 1259/14932 [01:57<21:13, 10.74it/s, v_num=o30c, train/loss=3.53\r", + "Epoch 0: 8%| | 1259/14932 [01:57<21:13, 10.74it/s, v_num=o30c, train/loss=3.22" ] }, { @@ -11169,7 +12581,8 @@ "output_type": "stream", "text": [ "\r", - "5r3-L12-D2048-E0_1- 93%[=================> ] 1.49G 12.8MB/s eta 25s " + "Epoch 0: 8%| | 1260/14932 [01:57<21:12, 10.74it/s, v_num=o30c, train/loss=3.22\r", + "Epoch 0: 8%| | 1260/14932 [01:57<21:12, 10.74it/s, v_num=o30c, train/loss=3.92" ] }, { @@ -11177,7 +12590,8 @@ "output_type": "stream", "text": [ "\r", - "r3-L12-D2048-E0_1-e 93%[=================> ] 1.49G 13.0MB/s eta 25s " + "Epoch 0: 8%| | 1261/14932 [01:57<21:12, 10.74it/s, v_num=o30c, train/loss=3.92\r", + "Epoch 0: 8%| | 1261/14932 [01:57<21:12, 10.74it/s, v_num=o30c, train/loss=1.79" ] }, { @@ -11185,7 +12599,8 @@ "output_type": "stream", "text": [ "\r", - "3-L12-D2048-E0_1-en 93%[=================> ] 1.50G 13.1MB/s eta 25s " + "Epoch 0: 8%| | 1262/14932 [01:57<21:12, 10.74it/s, v_num=o30c, train/loss=1.79\r", + "Epoch 0: 8%| | 1262/14932 [01:57<21:12, 10.74it/s, v_num=o30c, train/loss=4.41" ] }, { @@ -11193,7 +12608,8 @@ "output_type": "stream", "text": [ "\r", - "-L12-D2048-E0_1-enw 93%[=================> ] 1.50G 13.2MB/s eta 22s " + "Epoch 0: 8%| | 1263/14932 [01:57<21:12, 10.75it/s, v_num=o30c, train/loss=4.41\r", + "Epoch 0: 8%| | 1263/14932 [01:57<21:12, 10.75it/s, v_num=o30c, train/loss=2.89" ] }, { @@ -11201,7 +12617,8 @@ "output_type": "stream", "text": [ "\r", - "L12-D2048-E0_1-enwi 93%[=================> ] 1.50G 13.3MB/s eta 22s " + "Epoch 0: 8%| | 1264/14932 [01:57<21:11, 10.75it/s, v_num=o30c, train/loss=2.89\r", + "Epoch 0: 8%| | 1264/14932 [01:57<21:11, 10.75it/s, v_num=o30c, train/loss=3.14" ] }, { @@ -11209,7 +12626,8 @@ "output_type": "stream", "text": [ "\r", - "12-D2048-E0_1-enwik 94%[=================> ] 1.51G 13.4MB/s eta 22s " + "Epoch 0: 8%| | 1265/14932 [01:57<21:11, 10.75it/s, v_num=o30c, train/loss=3.14\r", + "Epoch 0: 8%| | 1265/14932 [01:57<21:11, 10.75it/s, v_num=o30c, train/loss=2.17" ] }, { @@ -11217,7 +12635,8 @@ "output_type": "stream", "text": [ "\r", - "2-D2048-E0_1-enwiki 94%[=================> ] 1.51G 13.4MB/s eta 22s " + "Epoch 0: 8%| | 1266/14932 [01:57<21:11, 10.75it/s, v_num=o30c, train/loss=2.17\r", + "Epoch 0: 8%| | 1266/14932 [01:57<21:11, 10.75it/s, v_num=o30c, train/loss=2.53" ] }, { @@ -11225,7 +12644,8 @@ "output_type": "stream", "text": [ "\r", - "-D2048-E0_1-enwiki- 94%[=================> ] 1.51G 13.5MB/s eta 22s " + "Epoch 0: 8%| | 1267/14932 [01:57<21:10, 10.75it/s, v_num=o30c, train/loss=2.53\r", + "Epoch 0: 8%| | 1267/14932 [01:57<21:10, 10.75it/s, v_num=o30c, train/loss=4.06" ] }, { @@ -11233,7 +12653,8 @@ "output_type": "stream", "text": [ "\r", - "D2048-E0_1-enwiki-4 94%[=================> ] 1.52G 13.5MB/s eta 18s " + "Epoch 0: 8%| | 1268/14932 [01:57<21:10, 10.75it/s, v_num=o30c, train/loss=4.06\r", + "Epoch 0: 8%| | 1268/14932 [01:57<21:10, 10.75it/s, v_num=o30c, train/loss=2.62" ] }, { @@ -11241,7 +12662,8 @@ "output_type": "stream", "text": [ "\r", - "2048-E0_1-enwiki-4k 94%[=================> ] 1.52G 13.5MB/s eta 18s " + "Epoch 0: 8%| | 1269/14932 [01:57<21:10, 10.76it/s, v_num=o30c, train/loss=2.62\r", + "Epoch 0: 8%| | 1269/14932 [01:57<21:10, 10.76it/s, v_num=o30c, train/loss=4.00" ] }, { @@ -11249,7 +12671,8 @@ "output_type": "stream", "text": [ "\r", - "048-E0_1-enwiki-4k. 95%[==================> ] 1.52G 13.6MB/s eta 18s " + "Epoch 0: 9%| | 1270/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=4.00\r", + "Epoch 0: 9%| | 1270/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=2.36" ] }, { @@ -11257,7 +12680,8 @@ "output_type": "stream", "text": [ "\r", - "48-E0_1-enwiki-4k.p 95%[==================> ] 1.53G 13.6MB/s eta 18s " + "Epoch 0: 9%| | 1271/14932 [01:58<21:10, 10.75it/s, v_num=o30c, train/loss=2.36\r", + "Epoch 0: 9%| | 1271/14932 [01:58<21:10, 10.75it/s, v_num=o30c, train/loss=4.22" ] }, { @@ -11265,7 +12689,8 @@ "output_type": "stream", "text": [ "\r", - "8-E0_1-enwiki-4k.pt 95%[==================> ] 1.53G 13.6MB/s eta 18s " + "Epoch 0: 9%| | 1272/14932 [01:58<21:10, 10.75it/s, v_num=o30c, train/loss=4.22\r", + "Epoch 0: 9%| | 1272/14932 [01:58<21:10, 10.75it/s, v_num=o30c, train/loss=3.59" ] }, { @@ -11273,7 +12698,8 @@ "output_type": "stream", "text": [ "\r", - "-E0_1-enwiki-4k.pth 95%[==================> ] 1.53G 13.6MB/s eta 15s " + "Epoch 0: 9%| | 1273/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=3.59\r", + "Epoch 0: 9%| | 1273/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=1.73" ] }, { @@ -11281,7 +12707,8 @@ "output_type": "stream", "text": [ "\r", - "E0_1-enwiki-4k.pth 95%[==================> ] 1.54G 13.5MB/s eta 15s " + "Epoch 0: 9%| | 1274/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=1.73\r", + "Epoch 0: 9%| | 1274/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=3.69" ] }, { @@ -11289,7 +12716,8 @@ "output_type": "stream", "text": [ "\r", - "0_1-enwiki-4k.pth 96%[==================> ] 1.54G 13.4MB/s eta 15s " + "Epoch 0: 9%| | 1275/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=3.69\r", + "Epoch 0: 9%| | 1275/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=2.91" ] }, { @@ -11297,7 +12725,8 @@ "output_type": "stream", "text": [ "\r", - "_1-enwiki-4k.pth 96%[==================> ] 1.54G 13.4MB/s eta 15s " + "Epoch 0: 9%| | 1276/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=2.91\r", + "Epoch 0: 9%| | 1276/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=3.44" ] }, { @@ -11305,7 +12734,8 @@ "output_type": "stream", "text": [ "\r", - "1-enwiki-4k.pth 96%[==================> ] 1.55G 14.1MB/s eta 15s " + "Epoch 0: 9%| | 1277/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=3.44\r", + "Epoch 0: 9%| | 1277/14932 [01:58<21:09, 10.76it/s, v_num=o30c, train/loss=3.42" ] }, { @@ -11313,7 +12743,8 @@ "output_type": "stream", "text": [ "\r", - "-enwiki-4k.pth 96%[==================> ] 1.55G 12.9MB/s eta 11s " + "Epoch 0: 9%| | 1278/14932 [01:58<21:08, 10.76it/s, v_num=o30c, train/loss=3.42\r", + "Epoch 0: 9%| | 1278/14932 [01:58<21:08, 10.76it/s, v_num=o30c, train/loss=2.47" ] }, { @@ -11321,7 +12752,8 @@ "output_type": "stream", "text": [ "\r", - "enwiki-4k.pth 96%[==================> ] 1.55G 13.1MB/s eta 11s " + "Epoch 0: 9%| | 1279/14932 [01:58<21:08, 10.76it/s, v_num=o30c, train/loss=2.47\r", + "Epoch 0: 9%| | 1279/14932 [01:58<21:08, 10.76it/s, v_num=o30c, train/loss=3.31" ] }, { @@ -11329,7 +12761,8 @@ "output_type": "stream", "text": [ "\r", - "nwiki-4k.pth 97%[==================> ] 1.55G 13.0MB/s eta 11s " + "Epoch 0: 9%| | 1280/14932 [01:59<21:09, 10.75it/s, v_num=o30c, train/loss=3.31\r", + "Epoch 0: 9%| | 1280/14932 [01:59<21:09, 10.75it/s, v_num=o30c, train/loss=3.50" ] }, { @@ -11337,7 +12770,8 @@ "output_type": "stream", "text": [ "\r", - "wiki-4k.pth 97%[==================> ] 1.56G 12.9MB/s eta 11s " + "Epoch 0: 9%| | 1281/14932 [01:59<21:09, 10.76it/s, v_num=o30c, train/loss=3.50\r", + "Epoch 0: 9%| | 1281/14932 [01:59<21:09, 10.76it/s, v_num=o30c, train/loss=3.58" ] }, { @@ -11345,7 +12779,8 @@ "output_type": "stream", "text": [ "\r", - "iki-4k.pth 97%[==================> ] 1.56G 12.9MB/s eta 11s " + "Epoch 0: 9%| | 1282/14932 [01:59<21:08, 10.76it/s, v_num=o30c, train/loss=3.58\r", + "Epoch 0: 9%| | 1282/14932 [01:59<21:08, 10.76it/s, v_num=o30c, train/loss=2.69" ] }, { @@ -11353,7 +12788,8 @@ "output_type": "stream", "text": [ "\r", - "ki-4k.pth 97%[==================> ] 1.56G 12.8MB/s eta 8s " + "Epoch 0: 9%| | 1283/14932 [01:59<21:08, 10.76it/s, v_num=o30c, train/loss=2.69\r", + "Epoch 0: 9%| | 1283/14932 [01:59<21:08, 10.76it/s, v_num=o30c, train/loss=2.92" ] }, { @@ -11361,7 +12797,8 @@ "output_type": "stream", "text": [ "\r", - "i-4k.pth 97%[==================> ] 1.57G 12.7MB/s eta 8s " + "Epoch 0: 9%| | 1284/14932 [01:59<21:08, 10.76it/s, v_num=o30c, train/loss=2.92\r", + "Epoch 0: 9%| | 1284/14932 [01:59<21:08, 10.76it/s, v_num=o30c, train/loss=2.89" ] }, { @@ -11369,7 +12806,8 @@ "output_type": "stream", "text": [ "\r", - "-4k.pth 97%[==================> ] 1.57G 12.6MB/s eta 8s " + "Epoch 0: 9%| | 1285/14932 [01:59<21:08, 10.76it/s, v_num=o30c, train/loss=2.89\r", + "Epoch 0: 9%| | 1285/14932 [01:59<21:08, 10.76it/s, v_num=o30c, train/loss=2.95" ] }, { @@ -11377,7 +12815,8 @@ "output_type": "stream", "text": [ "\r", - "4k.pth 97%[==================> ] 1.57G 11.9MB/s eta 8s " + "Epoch 0: 9%| | 1286/14932 [01:59<21:07, 10.76it/s, v_num=o30c, train/loss=2.95\r", + "Epoch 0: 9%| | 1286/14932 [01:59<21:07, 10.76it/s, v_num=o30c, train/loss=3.64" ] }, { @@ -11385,7 +12824,8 @@ "output_type": "stream", "text": [ "\r", - "k.pth 98%[==================> ] 1.57G 11.9MB/s eta 6s " + "Epoch 0: 9%| | 1287/14932 [01:59<21:07, 10.77it/s, v_num=o30c, train/loss=3.64\r", + "Epoch 0: 9%| | 1287/14932 [01:59<21:07, 10.77it/s, v_num=o30c, train/loss=3.03" ] }, { @@ -11393,7 +12833,8 @@ "output_type": "stream", "text": [ "\r", - ".pth 98%[==================> ] 1.57G 10.8MB/s eta 6s " + "Epoch 0: 9%| | 1288/14932 [01:59<21:07, 10.77it/s, v_num=o30c, train/loss=3.03\r", + "Epoch 0: 9%| | 1288/14932 [01:59<21:07, 10.77it/s, v_num=o30c, train/loss=3.64" ] }, { @@ -11401,7 +12842,8 @@ "output_type": "stream", "text": [ "\r", - "pth 98%[==================> ] 1.58G 10.8MB/s eta 6s " + "Epoch 0: 9%| | 1289/14932 [01:59<21:06, 10.77it/s, v_num=o30c, train/loss=3.64\r", + "Epoch 0: 9%| | 1289/14932 [01:59<21:06, 10.77it/s, v_num=o30c, train/loss=4.16" ] }, { @@ -11409,7 +12851,8 @@ "output_type": "stream", "text": [ "\r", - "th 98%[==================> ] 1.58G 10.4MB/s eta 6s " + "Epoch 0: 9%| | 1290/14932 [01:59<21:06, 10.77it/s, v_num=o30c, train/loss=4.16\r", + "Epoch 0: 9%| | 1290/14932 [01:59<21:06, 10.77it/s, v_num=o30c, train/loss=3.34" ] }, { @@ -11417,7 +12860,8 @@ "output_type": "stream", "text": [ "\r", - "h 98%[==================> ] 1.58G 9.87MB/s eta 5s " + "Epoch 0: 9%| | 1291/14932 [01:59<21:06, 10.77it/s, v_num=o30c, train/loss=3.34\r", + "Epoch 0: 9%| | 1291/14932 [01:59<21:06, 10.77it/s, v_num=o30c, train/loss=4.47" ] }, { @@ -11425,7 +12869,8 @@ "output_type": "stream", "text": [ "\r", - " 98%[==================> ] 1.58G 9.45MB/s eta 5s " + "Epoch 0: 9%| | 1292/14932 [01:59<21:06, 10.77it/s, v_num=o30c, train/loss=4.47\r", + "Epoch 0: 9%| | 1292/14932 [01:59<21:06, 10.77it/s, v_num=o30c, train/loss=3.30" ] }, { @@ -11433,7 +12878,8 @@ "output_type": "stream", "text": [ "\r", - " v 98%[==================> ] 1.58G 9.12MB/s eta 5s " + "Epoch 0: 9%| | 1293/14932 [01:59<21:05, 10.78it/s, v_num=o30c, train/loss=3.30\r", + "Epoch 0: 9%| | 1293/14932 [01:59<21:05, 10.78it/s, v_num=o30c, train/loss=3.16" ] }, { @@ -11441,7 +12887,8 @@ "output_type": "stream", "text": [ "\r", - " v5 98%[==================> ] 1.58G 9.15MB/s eta 5s " + "Epoch 0: 9%| | 1294/14932 [02:00<21:05, 10.78it/s, v_num=o30c, train/loss=3.16\r", + "Epoch 0: 9%| | 1294/14932 [02:00<21:05, 10.78it/s, v_num=o30c, train/loss=4.22" ] }, { @@ -11449,7 +12896,8 @@ "output_type": "stream", "text": [ "\r", - " v5r 98%[==================> ] 1.59G 8.76MB/s eta 5s " + "Epoch 0: 9%| | 1295/14932 [02:00<21:05, 10.78it/s, v_num=o30c, train/loss=4.22\r", + "Epoch 0: 9%| | 1295/14932 [02:00<21:05, 10.78it/s, v_num=o30c, train/loss=3.12" ] }, { @@ -11457,7 +12905,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3 99%[==================> ] 1.59G 8.35MB/s eta 3s " + "Epoch 0: 9%| | 1296/14932 [02:00<21:04, 10.78it/s, v_num=o30c, train/loss=3.12\r", + "Epoch 0: 9%| | 1296/14932 [02:00<21:04, 10.78it/s, v_num=o30c, train/loss=3.91" ] }, { @@ -11465,7 +12914,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3- 99%[==================> ] 1.59G 8.02MB/s eta 3s " + "Epoch 0: 9%| | 1297/14932 [02:00<21:04, 10.78it/s, v_num=o30c, train/loss=3.91\r", + "Epoch 0: 9%| | 1297/14932 [02:00<21:04, 10.78it/s, v_num=o30c, train/loss=3.12" ] }, { @@ -11473,7 +12923,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L 99%[==================> ] 1.59G 8.47MB/s eta 3s " + "Epoch 0: 9%| | 1298/14932 [02:00<21:04, 10.78it/s, v_num=o30c, train/loss=3.12\r", + "Epoch 0: 9%| | 1298/14932 [02:00<21:04, 10.78it/s, v_num=o30c, train/loss=3.61" ] }, { @@ -11481,7 +12932,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L1 99%[==================> ] 1.59G 8.08MB/s eta 3s " + "Epoch 0: 9%| | 1299/14932 [02:00<21:04, 10.78it/s, v_num=o30c, train/loss=3.61\r", + "Epoch 0: 9%| | 1299/14932 [02:00<21:04, 10.78it/s, v_num=o30c, train/loss=2.92" ] }, { @@ -11489,7 +12941,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12 99%[==================> ] 1.59G 7.88MB/s eta 3s " + "Epoch 0: 9%| | 1300/14932 [02:00<21:03, 10.79it/s, v_num=o30c, train/loss=2.92\r", + "Epoch 0: 9%| | 1300/14932 [02:00<21:03, 10.79it/s, v_num=o30c, train/loss=3.28" ] }, { @@ -11497,7 +12950,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12- 99%[==================> ] 1.60G 7.72MB/s eta 1s " + "Epoch 0: 9%| | 1301/14932 [02:00<21:03, 10.79it/s, v_num=o30c, train/loss=3.28\r", + "Epoch 0: 9%| | 1301/14932 [02:00<21:03, 10.79it/s, v_num=o30c, train/loss=3.64" ] }, { @@ -11505,7 +12959,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D 99%[==================> ] 1.60G 7.50MB/s eta 1s " + "Epoch 0: 9%| | 1302/14932 [02:00<21:03, 10.79it/s, v_num=o30c, train/loss=3.64\r", + "Epoch 0: 9%| | 1302/14932 [02:00<21:03, 10.79it/s, v_num=o30c, train/loss=4.56" ] }, { @@ -11513,7 +12968,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D2 99%[==================> ] 1.60G 7.28MB/s eta 1s " + "Epoch 0: 9%| | 1303/14932 [02:00<21:03, 10.79it/s, v_num=o30c, train/loss=4.56\r", + "Epoch 0: 9%| | 1303/14932 [02:00<21:03, 10.79it/s, v_num=o30c, train/loss=2.55" ] }, { @@ -11521,7 +12977,8 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D20 99%[==================> ] 1.60G 7.09MB/s eta 1s " + "Epoch 0: 9%| | 1304/14932 [02:00<21:02, 10.79it/s, v_num=o30c, train/loss=2.55\r", + "Epoch 0: 9%| | 1304/14932 [02:00<21:02, 10.79it/s, v_num=o30c, train/loss=3.39" ] }, { @@ -11529,79 +12986,42 @@ "output_type": "stream", "text": [ "\r", - " v5r3-L12-D204 99%[==================> ] 1.60G 6.86MB/s eta 1s \r", - "v5r3-L12-D2048-E0_1 100%[===================>] 1.60G 6.89MB/s in 5m 34s \r\n", - "\r\n", - "2023-09-06 16:27:17 (4.91 MB/s) - ‘v5r3-L12-D2048-E0_1-enwiki-4k.pth’ saved [1721187013/1721187013]\r\n", - "\r\n" + "Epoch 0: 9%| | 1305/14932 [02:00<21:02, 10.79it/s, v_num=o30c, train/loss=3.39\r", + "Epoch 0: 9%| | 1305/14932 [02:00<21:02, 10.79it/s, v_num=o30c, train/loss=3.75" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "total 1.6G\r\n", - "drwxr-xr-x 2 root root 3 Sep 6 16:21 .\r\n", - "drwxr-xr-x 20 root root 24 Sep 6 16:21 ..\r\n", - "-rw-r--r-- 1 root root 1.7G Sep 6 15:04 v5r3-L12-D2048-E0_1-enwiki-4k.pth\r\n" + "\r", + "Epoch 0: 9%| | 1306/14932 [02:00<21:02, 10.80it/s, v_num=o30c, train/loss=3.75\r", + "Epoch 0: 9%| | 1306/14932 [02:00<21:02, 10.80it/s, v_num=o30c, train/loss=2.94" ] - } - ], - "source": [ - "# Download the model directly (stop gap till HF sync issues is resolved)\n", - "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", - " wget -nc \"https://huggingface.co/rwkv-x-dev/rwkv-x-playground/resolve/main/experiment/rwkv-x-exp/v5-r3-memory/{DIR_NAME}/{FILENAME_PREFIX}-enwiki-4k.pth\"\n", - "\n", - "!cd \"{TRAINER_DIR}\" && cd \"../model/\" && \\\n", - " ls -alh ." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "5176ef5b", - "metadata": { - "papermill": { - "duration": 0.051299, - "end_time": "2023-09-06T16:27:17.447337", - "exception": false, - "start_time": "2023-09-06T16:27:17.396038", - "status": "completed" }, - "tags": [] - }, - "source": [ - "# Enwiki Stage 2 : Basic Instruct Tuning" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d5c54c51", - "metadata": { - "execution": { - "iopub.execute_input": "2023-09-06T16:27:17.550805Z", - "iopub.status.busy": "2023-09-06T16:27:17.550559Z", - "iopub.status.idle": "2023-09-06T16:27:32.626011Z", - "shell.execute_reply": "2023-09-06T16:27:32.625244Z" + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Epoch 0: 9%| | 1307/14932 [02:01<21:01, 10.80it/s, v_num=o30c, train/loss=2.94" + ] }, - "papermill": { - "duration": 15.129219, - "end_time": "2023-09-06T16:27:32.627774", - "exception": false, - "start_time": "2023-09-06T16:27:17.498555", - "status": "completed" + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Epoch 0: 9%| | 1307/14932 [02:01<21:01, 10.80it/s, v_num=o30c, train/loss=3.77" + ] }, - "tags": [] - }, - "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", - "Downloading readme: 0%| | 0.00/7.79k [00:00=12.1), as this is known to have freeze issues\r\n", - "# - The terms used in wandb / the progress bar can be confusing, see the github README.md for beter clarifications\r\n", - "# - When resuming from checkpoint, the estimated time is inaccurate\r\n", - "#\r\n", - "\r\n", - "[RWKV.model] Configuring optimizer with\r\n", - " - lr_init: 4.000e-04 (0.0004)\r\n", - " - lr_final: 3.000e-04 (0.0003)\r\n", - "\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...\r\n", - "Detected CUDA files, patching ldflags\r\n", - "Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...\r\n", - "Building extension module fused_adam...\r\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ninja: no work to do.\r\n", - "Loading extension module fused_adam...\r\n", - "Time to load fused_adam op: 0.07197451591491699 seconds\r\n", - "Loading `train_dataloader` to estimate number of stepping batches.\r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rank: 0 partition count [1, 1] and sizes[(860549120, False), (768, False)] \r\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n", - " | Name | Type | Params\r\n", - "--------------------------------------\r\n", - "0 | emb | Embedding | 102 M \r\n", - "1 | blocks | ModuleList | 654 M \r\n", - "2 | ln_out | LayerNorm | 4.1 K \r\n", - "3 | head | Linear | 102 M \r\n", - "--------------------------------------\r\n", - "860 M Trainable params\r\n", - "0 Non-trainable params\r\n", - "860 M Total params\r\n", - "3,442.200 Total estimated model params size (MB)\r\n" + "Epoch 0: 9%| | 1333/14932 [02:03<20:56, 10.82it/s, v_num=o30c, train/loss=1.52\r", + "Epoch 0: 9%| | 1333/14932 [02:03<20:56, 10.82it/s, v_num=o30c, train/loss=2.34" ] }, { @@ -11989,9 +13261,8 @@ "output_type": "stream", "text": [ "\r", - "Training: 0it [00:00, ?it/s]\r", - "Training: 0%| | 0/14932 [00:00\r\n", + " cli_main()\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 233, in cli_main\r\n", + " LightningCLI(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 353, in __init__\r\n", + " self._run_subcommand(self.subcommand)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 642, in _run_subcommand\r\n", + " fn(**fn_kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 529, in fit\r\n", + " call._call_and_handle_interrupt(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 41, in _call_and_handle_interrupt\r\n", + " return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/strategies/launchers/subprocess_script.py\", line 91, in launch\r\n", + " return function(*args, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 568, in _fit_impl\r\n", + " self._run(model, ckpt_path=ckpt_path)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/trainer.py\", line 925, in _run\r\n", + " self._data_connector.prepare_data()\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py\", line 94, in prepare_data\r\n", + " call._call_lightning_datamodule_hook(trainer, \"prepare_data\")\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/trainer/call.py\", line 164, in _call_lightning_datamodule_hook\r\n", + " return fn(*args, **kwargs)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 542, in prepare_data\r\n", + " prepare_data_static(**self._init_locals)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/data.py\", line 101, in prepare_data_static\r\n", + " src_dataset = load_dataset(**load_dataset_params)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 2112, in load_dataset\r\n", + " builder_instance = load_dataset_builder(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1798, in load_dataset_builder\r\n", + " dataset_module = dataset_module_factory(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 1413, in dataset_module_factory\r\n", + " ).get_module()\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/load.py\", line 948, in get_module\r\n", + " patterns = sanitize_patterns(self.data_files) if self.data_files is not None else get_data_patterns(base_path)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/datasets/data_files.py\", line 459, in get_data_patterns\r\n", + " raise EmptyDatasetError(f\"The directory at {base_path} doesn't contain any data files\") from None\r\n", + "datasets.data_files.EmptyDatasetError: The directory at /actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/dataset doesn't contain any data files\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33mv5r3-L12-D2048-E0.1 - Mem-Instruct (train-ctx=512, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/runs/jm2b2y5r\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjkzMjg5ODA3/version_details/v24\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230906_173654-jm2b2y5r/logs\u001b[0m\r\n" ] + } + ], + "source": [ + "# Start the finetune model training\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", + " python3 lightning_trainer.py fit \\\n", + " -c \"{CONFIG_DIR}/config-mem-instruct.yaml\" \\\n", + " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Instruct (train-ctx=512, {DEEPSPEED_STRAT})\" \\\n", + " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", + " --trainer.devices=\"{GPU_DEVICES}\" \\\n", + " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-instruct/\" \\\n", + " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \\\n", + " --model.ctx_len=512 \\\n", + " --model.bptt_learning_range=1" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1aa1f08c", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:37:15.869868Z", + "iopub.status.busy": "2023-09-06T17:37:15.869574Z", + "iopub.status.idle": "2023-09-06T17:37:18.336322Z", + "shell.execute_reply": "2023-09-06T17:37:18.335570Z" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " or" - ] + "papermill": { + "duration": 3.092432, + "end_time": "2023-09-06T17:37:18.338136", + "exception": false, + "start_time": "2023-09-06T17:37:15.245704", + "status": "completed" }, + "tags": [] + }, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a" + "[2023-09-06 17:37:17,473] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " dragon" + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in \r\n", + " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", + " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n", + " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", + "ValueError: Unable to find 'latest' file at ../checkpoint/v5r3-L12-D2048-E0_1-mem-instruct/last.ckpt/latest\r\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "." + "ls: cannot access '../model/v5r3-L12-D2048-E0_1-mem-instruct.pth': No such file or directory\r\n" ] + } + ], + "source": [ + "# Lets export the model from the checkpoint\n", + "!cd \"{TRAINER_DIR}\" && \\\n", + " python3 export_checkpoint.py \\\n", + " \"../checkpoint/{FILENAME_PREFIX}-mem-instruct/last.ckpt\" \\\n", + " \"../model/{FILENAME_PREFIX}-mem-instruct.pth\" \"bf16\"\n", + "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-instruct.pth\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "07518561", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:37:19.555489Z", + "iopub.status.busy": "2023-09-06T17:37:19.555305Z", + "iopub.status.idle": "2023-09-06T17:37:19.787874Z", + "shell.execute_reply": "2023-09-06T17:37:19.787184Z" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\r\n" - ] + "papermill": { + "duration": 0.823967, + "end_time": "2023-09-06T17:37:19.789695", + "exception": false, + "start_time": "2023-09-06T17:37:18.965728", + "status": "completed" }, + "tags": [] + }, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "8" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/eval_v5_memory_guided.py': [Errno 2] No such file or directory\r\n" ] + } + ], + "source": [ + "# Lets do a quick memory test\n", + "!python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-instruct.pth\"" + ] + }, + { + "cell_type": "markdown", + "id": "42fec908", + "metadata": { + "papermill": { + "duration": 0.633567, + "end_time": "2023-09-06T17:37:21.058555", + "exception": false, + "start_time": "2023-09-06T17:37:20.424988", + "status": "completed" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "." - ] + "tags": [] + }, + "source": [ + "## Tune 2 : Low ctx size (512), memory training\n", + "\n", + "- Tune 2: Low ctx size (512), Training with instruction & input masked. This forces the actual memory training on the output tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "577fea20", + "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:37:22.293582Z", + "iopub.status.busy": "2023-09-06T17:37:22.293253Z", + "iopub.status.idle": "2023-09-06T17:37:22.339491Z", + "shell.execute_reply": "2023-09-06T17:37:22.339111Z" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " Have" - ] + "papermill": { + "duration": 0.679609, + "end_time": "2023-09-06T17:37:22.341670", + "exception": false, + "start_time": "2023-09-06T17:37:21.662061", + "status": "completed" }, + "tags": [] + }, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " a" + "## Generating word reptition dataset ##\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " secret" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " to" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " stay" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " in" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " contact" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " with" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " one" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " another" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "." + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "\r\n" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "10" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "." + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " Be" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " loyal" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " to" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " your" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " friends" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " and" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " house" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "." + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "\r\n" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "9" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "." + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " Security" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " and" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - " workout" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "." + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "\r\n" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/gen_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "10" + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/shuffle_limited_prompt_completion_jsonl.py': [Errno 2] No such file or directory\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "." + "## Done ##\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " You" + "total 10K\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " could" + "drwxr-xr-x 2 root root 2 Sep 6 17:36 .\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "\r\n" - ] - } - ], - "source": [ - "# # Lets do a quick dragon prompt validation\n", - "!cd \"{INFERENCE_DIR}\" && \\\n", - " python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \"cuda fp32\"" - ] - }, - { - "cell_type": "markdown", - "id": "e8720f96", - "metadata": { - "tags": [ - "papermill-error-cell-tag" - ] - }, - "source": [ - "Execution using papermill encountered an exception here and stopped:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "c6f152ff", - "metadata": { - "execution": { - "iopub.execute_input": "2023-09-06T16:51:06.543089Z", - "iopub.status.busy": "2023-09-06T16:51:06.542771Z", - "iopub.status.idle": "2023-09-06T16:51:06.548302Z", - "shell.execute_reply": "2023-09-06T16:51:06.547314Z" - }, - "papermill": { - "duration": 0.694033, - "end_time": "2023-09-06T16:51:06.549477", - "exception": true, - "start_time": "2023-09-06T16:51:05.855444", - "status": "failed" - }, - "tags": [] - }, - "outputs": [ - { - "ename": "IndentationError", - "evalue": "unexpected indent (598249315.py, line 3)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m Cell \u001b[0;32mIn[9], line 3\u001b[0;36m\u001b[0m\n\u001b[0;31m python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth\"\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unexpected indent\n" + "drwxr-xr-x 6 root root 11 Sep 6 17:36 ..\n" ] } ], - "source": [ - "# Lets do a quick memory test\n", - "!python3\n", - " python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-enwiki-instruct.pth\"" - ] - }, - { - "cell_type": "markdown", - "id": "30ba2692", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "source": [ - "## Tune 1 : Simple Memory instruct finetuning\n", - "\n", - "- Tune 1: Low ctx size (512), Training with only the input masked. This does very limited memory training, and is used primarily to train the instruction set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4b49799", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "%%script bash\n", - "\n", - "########################################\n", - "# Generate the required jsonl dataset\n", - "########################################\n", - "\n", - "# Reset the dataset dir\n", - "mkdir -p ../dataset\n", - "rm -rf ../dataset/*.jsonl\n", - "\n", - "# Generate the various datasets\n", - "echo \"## Generating word reptition dataset ##\"\n", - "\n", - "# We do a strong bias for smaller word count, to teach the concept from scratch\n", - "# so that the model can learn the function. \n", - "#\n", - "# Note that all document samples, are randomized between the target word count, \n", - "# to half of the target word count.\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-2-count.jsonl 2 5000 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-5-count.jsonl 5 5000 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-10-count.jsonl 10 2500 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-15-count.jsonl 15 2500 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-20-count.jsonl 20 2500 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-25-count.jsonl 25 2500 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-40-count.jsonl 40 2500 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-50-count.jsonl 50 2500 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-60-count.jsonl 80 2500 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-80-count.jsonl 80 2500 &\n", - "\n", - "# With a slight mix of the larger word count\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-100-count.jsonl 100 2500 &\n", - "python3 ../memory_script/gen_limited_segmented_jsonl.py ../dataset/word-200-count.jsonl 200 2500 &\n", - "\n", - "wait\n", - "echo \"## Done ##\"\n", - "\n", - "ls -alh ../dataset/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f45649ff", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Start the finetune model training\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " export WANDB_MODE=\"{WANDB_MODE}\" && \\\n", - " python3 lightning_trainer.py fit \\\n", - " -c \"{CONFIG_DIR}/config-mem-instruct.yaml\" \\\n", - " --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Mem-Instruct (train-ctx=512, {DEEPSPEED_STRAT})\" \\\n", - " --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n", - " --trainer.devices=\"{GPU_DEVICES}\" \\\n", - " --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-instruct/\" \\\n", - " --model.load_model=\"../model/{FILENAME_PREFIX}-enwiki-instruct.pth\" \\\n", - " --model.ctx_len=512 \\\n", - " --model.bptt_learning_range=1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eee1dae8", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Lets export the model from the checkpoint\n", - "!cd \"{TRAINER_DIR}\" && \\\n", - " python3 export_checkpoint.py \\\n", - " \"../checkpoint/{FILENAME_PREFIX}-mem-instruct/last.ckpt\" \\\n", - " \"../model/{FILENAME_PREFIX}-mem-instruct.pth\" \"bf16\"\n", - "!cd \"{TRAINER_DIR}\" && ls -alh \"../model/{FILENAME_PREFIX}-mem-instruct.pth\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77c46f2c", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Lets do a quick memory test\n", - "!python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-instruct.pth\"" - ] - }, - { - "cell_type": "markdown", - "id": "afe593cc", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "source": [ - "## Tune 2 : Low ctx size (512), memory training\n", - "\n", - "- Tune 2: Low ctx size (512), Training with instruction & input masked. This forces the actual memory training on the output tokens." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e27a0ba5", - "metadata": { - "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" - }, - "tags": [] - }, - "outputs": [], "source": [ "%%script bash\n", "\n", @@ -150557,19 +140239,109 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "71899f60", + "execution_count": 15, + "id": "5928163b", "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:37:23.574211Z", + "iopub.status.busy": "2023-09-06T17:37:23.573930Z", + "iopub.status.idle": "2023-09-06T17:37:41.229665Z", + "shell.execute_reply": "2023-09-06T17:37:41.228859Z" + }, "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" + "duration": 18.288664, + "end_time": "2023-09-06T17:37:41.231690", + "exception": false, + "start_time": "2023-09-06T17:37:22.943026", + "status": "completed" }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-09-06 17:37:26,449] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py:484: UserWarning: LightningCLI's args parameter is intended to run from within Python like if it were from the command line. To prevent mistakes it is not recommended to provide both args and command line arguments, got: sys.argv[1:]=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/config-mem-template.yaml', '--trainer.logger.init_args.name=v5r3-L12-D2048-E0.1 - Mem-Tune ctx-512 (train-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5r3-L12-D2048-E0_1-mem-ctx-512/', '--model.lr_init=5e-4', '--model.lr_final=4e-4', '--data.max_token_size=512', '--model.ctx_len=512', '--model.bptt_learning_range=1', '--model.load_model=../model/v5r3-L12-D2048-E0_1-mem-instruct.pth'], args=['fit', '-c', '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/config-mem-template.yaml', '--trainer.logger.init_args.name=v5r3-L12-D2048-E0.1 - Mem-Tune ctx-512 (train-ctx=512, deepspeed_stage_1)', '--trainer.strategy=deepspeed_stage_1', '--trainer.devices=auto', '--trainer.callbacks.init_args.dirpath=../checkpoint/v5r3-L12-D2048-E0_1-mem-ctx-512/', '--model.lr_init=5e-4', '--model.lr_final=4e-4', '--data.max_token_size=512', '--model.ctx_len=512', '--model.bptt_learning_range=1', '--model.load_model=../model/v5r3-L12-D2048-E0_1-mem-instruct.pth'].\r\n", + " rank_zero_warn(\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/lightning/fabric/utilities/seed.py:39: UserWarning: No seed found, seed set to 4258540337\r\n", + " rank_zero_warn(f\"No seed found, seed set to {seed}\")\r\n", + "Global seed set to 4258540337\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mpicocreator\u001b[0m (\u001b[33mrwkv-x-dev\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.9\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m./wandb/run-20230906_173728-uklb27ld\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mv5r3-L12-D2048-E0.1 - Mem-Tune ctx-512 (train-ctx=512, deepspeed_stage_1)\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/runs/uklb27ld\u001b[0m\r\n", + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 254, in \r\n", + " cli_main()\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/lightning_trainer.py\", line 233, in cli_main\r\n", + " LightningCLI(\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 350, in __init__\r\n", + " self.instantiate_classes()\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/lightning/pytorch/cli.py\", line 499, in instantiate_classes\r\n", + " self.config_init = self.parser.instantiate_classes(self.config)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n", + " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1130, in instantiate_classes\r\n", + " cfg[subcommand] = subparser.instantiate_classes(cfg[subcommand], instantiate_groups=instantiate_groups)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_deprecated.py\", line 139, in patched_instantiate_classes\r\n", + " cfg = self._unpatched_instantiate_classes(cfg, **kwargs)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_core.py\", line 1124, in instantiate_classes\r\n", + " component.instantiate_class(component, cfg)\r\n", + " File \"/usr/local/lib/python3.10/dist-packages/jsonargparse/_signatures.py\", line 561, in group_instantiate_class\r\n", + " parent[key] = group.group_class(**value)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 559, in __init__\r\n", + " raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n", + "ValueError: load_model file '../model/v5r3-L12-D2048-E0_1-mem-instruct.pth' does not exist\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[31m(failed 1).\u001b[0m Press Control-C to abort syncing.\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33mv5r3-L12-D2048-E0.1 - Mem-Tune ctx-512 (train-ctx=512, deepspeed_stage_1)\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/runs/uklb27ld\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/rwkv-x-dev/RWKV-X-Experiments/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjkzMjg5ODA3/version_details/v25\u001b[0m\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\r\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230906_173728-uklb27ld/logs\u001b[0m\r\n" + ] + } + ], "source": [ "# Start the finetune model training\n", "!cd \"{TRAINER_DIR}\" && \\\n", @@ -150590,19 +140362,54 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "29773d32", + "execution_count": 16, + "id": "3c4e1a84", "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:37:42.501242Z", + "iopub.status.busy": "2023-09-06T17:37:42.500756Z", + "iopub.status.idle": "2023-09-06T17:37:44.926179Z", + "shell.execute_reply": "2023-09-06T17:37:44.925416Z" + }, "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" + "duration": 3.062059, + "end_time": "2023-09-06T17:37:44.927885", + "exception": false, + "start_time": "2023-09-06T17:37:41.865826", + "status": "completed" }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-09-06 17:37:44,062] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 651, in \r\n", + " convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, output_file, save_dtype=args.dtype)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 542, in convert_zero_checkpoint_to_fp32_state_dict\r\n", + " state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)\r\n", + " File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/export_checkpoint.py\", line 516, in get_fp32_state_dict_from_zero_checkpoint\r\n", + " raise ValueError(f\"Unable to find 'latest' file at {latest_path}\")\r\n", + "ValueError: Unable to find 'latest' file at ../checkpoint/v5r3-L12-D2048-E0_1-mem-ctx-512/last.ckpt/latest\r\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ls: cannot access '../model/v5r3-L12-D2048-E0_1-mem-ctx-512.pth': No such file or directory\r\n" + ] + } + ], "source": [ "# Lets export the model from the checkpoint\n", "!cd \"{TRAINER_DIR}\" && \\\n", @@ -150614,19 +140421,33 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "680bcd20", + "execution_count": 17, + "id": "ff1e2d52", "metadata": { + "execution": { + "iopub.execute_input": "2023-09-06T17:37:46.169833Z", + "iopub.status.busy": "2023-09-06T17:37:46.169341Z", + "iopub.status.idle": "2023-09-06T17:37:46.403204Z", + "shell.execute_reply": "2023-09-06T17:37:46.402355Z" + }, "papermill": { - "duration": null, - "end_time": null, - "exception": null, - "start_time": null, - "status": "pending" + "duration": 0.870871, + "end_time": "2023-09-06T17:37:46.404923", + "exception": false, + "start_time": "2023-09-06T17:37:45.534052", + "status": "completed" }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "python3: can't open file '/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/../memory_script/eval_v5_memory_guided.py': [Errno 2] No such file or directory\r\n" + ] + } + ], "source": [ "# Lets do a quick memory test\n", "!python3 ../memory_script/eval_v5_memory_guided.py \"{PROJECT_DIR}/model/{FILENAME_PREFIX}-mem-ctx-512.pth\"" @@ -150653,14 +140474,14 @@ }, "papermill": { "default_parameters": {}, - "duration": 1770.227487, - "end_time": "2023-09-06T16:51:07.320378", + "duration": 1500.445098, + "end_time": "2023-09-06T17:37:47.161793", "environment_variables": {}, - "exception": true, + "exception": null, "input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/part2.ipynb", "output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/v5-r3-memory/L12-D2048-E1e-1-ctx4k/part2.ipynb", "parameters": {}, - "start_time": "2023-09-06T16:21:37.092891", + "start_time": "2023-09-06T17:12:46.716695", "version": "2.4.0" } },