{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.935483870967742, "eval_steps": 30, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008064516129032258, "grad_norm": NaN, "learning_rate": 0.0, "loss": 4.423851013183594, "num_input_tokens_seen": 3376, "step": 1, "train_runtime": 66.7001, "train_tokens_per_second": 50.615 }, { "epoch": 0.016129032258064516, "grad_norm": 6.203426361083984, "learning_rate": 0.0, "loss": 4.399973392486572, "num_input_tokens_seen": 6750, "step": 2, "train_runtime": 76.9504, "train_tokens_per_second": 87.719 }, { "epoch": 0.024193548387096774, "grad_norm": 6.417575359344482, "learning_rate": 4e-05, "loss": 4.3025593757629395, "num_input_tokens_seen": 10132, "step": 3, "train_runtime": 83.942, "train_tokens_per_second": 120.702 }, { "epoch": 0.03225806451612903, "grad_norm": 5.7837066650390625, "learning_rate": 8e-05, "loss": 4.235351085662842, "num_input_tokens_seen": 13496, "step": 4, "train_runtime": 90.8369, "train_tokens_per_second": 148.574 }, { "epoch": 0.04032258064516129, "grad_norm": 6.007510662078857, "learning_rate": 0.00012, "loss": 3.714757204055786, "num_input_tokens_seen": 16978, "step": 5, "train_runtime": 97.7335, "train_tokens_per_second": 173.717 }, { "epoch": 0.04838709677419355, "grad_norm": 5.7335405349731445, "learning_rate": 0.00016, "loss": 2.894195795059204, "num_input_tokens_seen": 20170, "step": 6, "train_runtime": 104.1089, "train_tokens_per_second": 193.739 }, { "epoch": 0.056451612903225805, "grad_norm": 3.857361078262329, "learning_rate": 0.0002, "loss": 2.3366570472717285, "num_input_tokens_seen": 23566, "step": 7, "train_runtime": 110.8035, "train_tokens_per_second": 212.683 }, { "epoch": 0.06451612903225806, "grad_norm": 3.2568986415863037, "learning_rate": 0.00019999967645432384, "loss": 1.862092137336731, "num_input_tokens_seen": 26930, "step": 8, "train_runtime": 117.3062, "train_tokens_per_second": 229.57 }, { "epoch": 0.07258064516129033, "grad_norm": 2.912194013595581, "learning_rate": 0.00019999870581938894, "loss": 1.5545849800109863, "num_input_tokens_seen": 30108, "step": 9, "train_runtime": 123.6618, "train_tokens_per_second": 243.47 }, { "epoch": 0.08064516129032258, "grad_norm": 2.0680058002471924, "learning_rate": 0.0001999970881014762, "loss": 1.4453465938568115, "num_input_tokens_seen": 33254, "step": 10, "train_runtime": 129.8936, "train_tokens_per_second": 256.01 }, { "epoch": 0.08870967741935484, "grad_norm": 1.753443717956543, "learning_rate": 0.00019999482331105377, "loss": 1.3031213283538818, "num_input_tokens_seen": 36458, "step": 11, "train_runtime": 136.2063, "train_tokens_per_second": 267.667 }, { "epoch": 0.0967741935483871, "grad_norm": 1.311978816986084, "learning_rate": 0.0001999919114627769, "loss": 1.155328631401062, "num_input_tokens_seen": 39848, "step": 12, "train_runtime": 142.7896, "train_tokens_per_second": 279.068 }, { "epoch": 0.10483870967741936, "grad_norm": 1.408735990524292, "learning_rate": 0.00019998835257548786, "loss": 1.1732935905456543, "num_input_tokens_seen": 42964, "step": 13, "train_runtime": 149.0986, "train_tokens_per_second": 288.158 }, { "epoch": 0.11290322580645161, "grad_norm": 1.0793702602386475, "learning_rate": 0.00019998414667221596, "loss": 1.071805715560913, "num_input_tokens_seen": 46336, "step": 14, "train_runtime": 155.6938, "train_tokens_per_second": 297.61 }, { "epoch": 0.12096774193548387, "grad_norm": 0.9536115527153015, "learning_rate": 0.00019997929378017725, "loss": 1.0273140668869019, "num_input_tokens_seen": 49524, "step": 15, "train_runtime": 162.1537, "train_tokens_per_second": 305.414 }, { "epoch": 0.12903225806451613, "grad_norm": 1.0113959312438965, "learning_rate": 0.00019997379393077428, "loss": 0.9971197247505188, "num_input_tokens_seen": 52878, "step": 16, "train_runtime": 168.7668, "train_tokens_per_second": 313.32 }, { "epoch": 0.13709677419354838, "grad_norm": 1.0217102766036987, "learning_rate": 0.00019996764715959618, "loss": 0.8978726267814636, "num_input_tokens_seen": 56342, "step": 17, "train_runtime": 175.5152, "train_tokens_per_second": 321.009 }, { "epoch": 0.14516129032258066, "grad_norm": 0.9274588227272034, "learning_rate": 0.0001999608535064182, "loss": 0.8967312574386597, "num_input_tokens_seen": 59734, "step": 18, "train_runtime": 182.3718, "train_tokens_per_second": 327.54 }, { "epoch": 0.1532258064516129, "grad_norm": 0.9428430795669556, "learning_rate": 0.0001999534130152014, "loss": 0.8502129912376404, "num_input_tokens_seen": 63022, "step": 19, "train_runtime": 189.0546, "train_tokens_per_second": 333.353 }, { "epoch": 0.16129032258064516, "grad_norm": 1.0413854122161865, "learning_rate": 0.00019994532573409262, "loss": 0.8611021041870117, "num_input_tokens_seen": 66380, "step": 20, "train_runtime": 195.6702, "train_tokens_per_second": 339.244 }, { "epoch": 0.1693548387096774, "grad_norm": 1.1126433610916138, "learning_rate": 0.0001999365917154239, "loss": 0.8235692977905273, "num_input_tokens_seen": 69412, "step": 21, "train_runtime": 201.791, "train_tokens_per_second": 343.98 }, { "epoch": 0.1774193548387097, "grad_norm": 1.0743067264556885, "learning_rate": 0.00019992721101571236, "loss": 0.8540856242179871, "num_input_tokens_seen": 72682, "step": 22, "train_runtime": 208.1813, "train_tokens_per_second": 349.128 }, { "epoch": 0.18548387096774194, "grad_norm": 0.9999114274978638, "learning_rate": 0.0001999171836956597, "loss": 0.8636960983276367, "num_input_tokens_seen": 75880, "step": 23, "train_runtime": 214.6214, "train_tokens_per_second": 353.553 }, { "epoch": 0.1935483870967742, "grad_norm": 0.8346990942955017, "learning_rate": 0.0001999065098201518, "loss": 0.7936219573020935, "num_input_tokens_seen": 79364, "step": 24, "train_runtime": 221.3966, "train_tokens_per_second": 358.47 }, { "epoch": 0.20161290322580644, "grad_norm": 0.9597648978233337, "learning_rate": 0.00019989518945825844, "loss": 0.7347128987312317, "num_input_tokens_seen": 82600, "step": 25, "train_runtime": 227.963, "train_tokens_per_second": 362.34 }, { "epoch": 0.20967741935483872, "grad_norm": 0.9161474704742432, "learning_rate": 0.00019988322268323268, "loss": 0.7401193976402283, "num_input_tokens_seen": 85954, "step": 26, "train_runtime": 234.5417, "train_tokens_per_second": 366.476 }, { "epoch": 0.21774193548387097, "grad_norm": 0.8665032386779785, "learning_rate": 0.00019987060957251047, "loss": 0.7189968228340149, "num_input_tokens_seen": 89314, "step": 27, "train_runtime": 241.1219, "train_tokens_per_second": 370.41 }, { "epoch": 0.22580645161290322, "grad_norm": 0.9274927377700806, "learning_rate": 0.00019985735020771017, "loss": 0.7693567276000977, "num_input_tokens_seen": 92550, "step": 28, "train_runtime": 247.6164, "train_tokens_per_second": 373.764 }, { "epoch": 0.23387096774193547, "grad_norm": 0.9395021796226501, "learning_rate": 0.00019984344467463197, "loss": 0.6852114200592041, "num_input_tokens_seen": 95904, "step": 29, "train_runtime": 254.2268, "train_tokens_per_second": 377.238 }, { "epoch": 0.24193548387096775, "grad_norm": 0.7659223079681396, "learning_rate": 0.0001998288930632574, "loss": 0.7528179883956909, "num_input_tokens_seen": 99314, "step": 30, "train_runtime": 260.9489, "train_tokens_per_second": 380.588 }, { "epoch": 0.24193548387096775, "eval_loss": 2.828965902328491, "eval_runtime": 19.7056, "eval_samples_per_second": 2.639, "eval_steps_per_second": 1.319, "num_input_tokens_seen": 99314, "step": 30 }, { "epoch": 0.25, "grad_norm": 0.8182386755943298, "learning_rate": 0.00019981369546774865, "loss": 0.7148607969284058, "num_input_tokens_seen": 102632, "step": 31, "train_runtime": 292.7626, "train_tokens_per_second": 350.564 }, { "epoch": 0.25806451612903225, "grad_norm": 0.8975741863250732, "learning_rate": 0.00019979785198644806, "loss": 0.7239475250244141, "num_input_tokens_seen": 106032, "step": 32, "train_runtime": 299.4571, "train_tokens_per_second": 354.081 }, { "epoch": 0.2661290322580645, "grad_norm": 0.8582456707954407, "learning_rate": 0.00019978136272187747, "loss": 0.6491944789886475, "num_input_tokens_seen": 109374, "step": 33, "train_runtime": 306.0795, "train_tokens_per_second": 357.339 }, { "epoch": 0.27419354838709675, "grad_norm": 0.8682915568351746, "learning_rate": 0.0001997642277807374, "loss": 0.726476788520813, "num_input_tokens_seen": 112780, "step": 34, "train_runtime": 312.8542, "train_tokens_per_second": 360.487 }, { "epoch": 0.28225806451612906, "grad_norm": 0.7901797294616699, "learning_rate": 0.00019974644727390665, "loss": 0.6442930698394775, "num_input_tokens_seen": 116154, "step": 35, "train_runtime": 319.5272, "train_tokens_per_second": 363.518 }, { "epoch": 0.2903225806451613, "grad_norm": 0.7850049138069153, "learning_rate": 0.00019972802131644127, "loss": 0.6595083475112915, "num_input_tokens_seen": 119350, "step": 36, "train_runtime": 325.9376, "train_tokens_per_second": 366.174 }, { "epoch": 0.29838709677419356, "grad_norm": 0.8906070590019226, "learning_rate": 0.00019970895002757413, "loss": 0.6988556385040283, "num_input_tokens_seen": 122566, "step": 37, "train_runtime": 332.3189, "train_tokens_per_second": 368.82 }, { "epoch": 0.3064516129032258, "grad_norm": 0.7804127335548401, "learning_rate": 0.00019968923353071377, "loss": 0.6648477911949158, "num_input_tokens_seen": 125906, "step": 38, "train_runtime": 339.1293, "train_tokens_per_second": 371.262 }, { "epoch": 0.31451612903225806, "grad_norm": 0.796406090259552, "learning_rate": 0.00019966887195344403, "loss": 0.6134414076805115, "num_input_tokens_seen": 129178, "step": 39, "train_runtime": 345.6909, "train_tokens_per_second": 373.681 }, { "epoch": 0.3225806451612903, "grad_norm": 0.8360472917556763, "learning_rate": 0.0001996478654275229, "loss": 0.6537702679634094, "num_input_tokens_seen": 132242, "step": 40, "train_runtime": 351.9107, "train_tokens_per_second": 375.783 }, { "epoch": 0.33064516129032256, "grad_norm": 0.7299121618270874, "learning_rate": 0.00019962621408888177, "loss": 0.6415868997573853, "num_input_tokens_seen": 135598, "step": 41, "train_runtime": 358.5853, "train_tokens_per_second": 378.147 }, { "epoch": 0.3387096774193548, "grad_norm": 0.7271934747695923, "learning_rate": 0.00019960391807762463, "loss": 0.6368349194526672, "num_input_tokens_seen": 139036, "step": 42, "train_runtime": 365.2471, "train_tokens_per_second": 380.663 }, { "epoch": 0.3467741935483871, "grad_norm": 0.7471262216567993, "learning_rate": 0.00019958097753802693, "loss": 0.6261013746261597, "num_input_tokens_seen": 142314, "step": 43, "train_runtime": 371.8635, "train_tokens_per_second": 382.705 }, { "epoch": 0.3548387096774194, "grad_norm": 0.7209194302558899, "learning_rate": 0.00019955739261853504, "loss": 0.6236422061920166, "num_input_tokens_seen": 145730, "step": 44, "train_runtime": 378.5663, "train_tokens_per_second": 384.952 }, { "epoch": 0.3629032258064516, "grad_norm": 0.7694782018661499, "learning_rate": 0.00019953316347176488, "loss": 0.6035186052322388, "num_input_tokens_seen": 148950, "step": 45, "train_runtime": 384.9456, "train_tokens_per_second": 386.938 }, { "epoch": 0.3709677419354839, "grad_norm": 0.701684832572937, "learning_rate": 0.00019950829025450114, "loss": 0.621403694152832, "num_input_tokens_seen": 152208, "step": 46, "train_runtime": 391.4835, "train_tokens_per_second": 388.798 }, { "epoch": 0.3790322580645161, "grad_norm": 0.6461907625198364, "learning_rate": 0.0001994827731276963, "loss": 0.5377134084701538, "num_input_tokens_seen": 155494, "step": 47, "train_runtime": 398.103, "train_tokens_per_second": 390.587 }, { "epoch": 0.3870967741935484, "grad_norm": 0.7744601368904114, "learning_rate": 0.00019945661225646946, "loss": 0.5798584222793579, "num_input_tokens_seen": 158822, "step": 48, "train_runtime": 404.6934, "train_tokens_per_second": 392.45 }, { "epoch": 0.3951612903225806, "grad_norm": 0.6859694123268127, "learning_rate": 0.0001994298078101054, "loss": 0.584874153137207, "num_input_tokens_seen": 162234, "step": 49, "train_runtime": 411.3749, "train_tokens_per_second": 394.37 }, { "epoch": 0.4032258064516129, "grad_norm": 0.810454249382019, "learning_rate": 0.00019940235996205333, "loss": 0.6668200492858887, "num_input_tokens_seen": 165674, "step": 50, "train_runtime": 418.1504, "train_tokens_per_second": 396.207 }, { "epoch": 0.4112903225806452, "grad_norm": 0.7270381450653076, "learning_rate": 0.0001993742688899259, "loss": 0.5657421350479126, "num_input_tokens_seen": 169012, "step": 51, "train_runtime": 424.8113, "train_tokens_per_second": 397.852 }, { "epoch": 0.41935483870967744, "grad_norm": 0.727074384689331, "learning_rate": 0.00019934553477549794, "loss": 0.573532223701477, "num_input_tokens_seen": 172396, "step": 52, "train_runtime": 431.419, "train_tokens_per_second": 399.602 }, { "epoch": 0.4274193548387097, "grad_norm": 0.7226607203483582, "learning_rate": 0.00019931615780470558, "loss": 0.5972438454627991, "num_input_tokens_seen": 175766, "step": 53, "train_runtime": 438.0204, "train_tokens_per_second": 401.274 }, { "epoch": 0.43548387096774194, "grad_norm": 0.815791130065918, "learning_rate": 0.00019928613816764458, "loss": 0.5655015707015991, "num_input_tokens_seen": 179034, "step": 54, "train_runtime": 444.5345, "train_tokens_per_second": 402.745 }, { "epoch": 0.4435483870967742, "grad_norm": 0.8063026666641235, "learning_rate": 0.00019925547605856934, "loss": 0.6497899293899536, "num_input_tokens_seen": 182380, "step": 55, "train_runtime": 451.1643, "train_tokens_per_second": 404.243 }, { "epoch": 0.45161290322580644, "grad_norm": 0.7375481128692627, "learning_rate": 0.00019922417167589183, "loss": 0.5818440914154053, "num_input_tokens_seen": 185574, "step": 56, "train_runtime": 457.5126, "train_tokens_per_second": 405.615 }, { "epoch": 0.4596774193548387, "grad_norm": 0.711955726146698, "learning_rate": 0.00019919222522217996, "loss": 0.615460991859436, "num_input_tokens_seen": 188970, "step": 57, "train_runtime": 464.1955, "train_tokens_per_second": 407.091 }, { "epoch": 0.46774193548387094, "grad_norm": 0.8263280391693115, "learning_rate": 0.00019915963690415647, "loss": 0.5618603229522705, "num_input_tokens_seen": 192282, "step": 58, "train_runtime": 470.7669, "train_tokens_per_second": 408.444 }, { "epoch": 0.47580645161290325, "grad_norm": 0.7747593522071838, "learning_rate": 0.00019912640693269752, "loss": 0.572489857673645, "num_input_tokens_seen": 195644, "step": 59, "train_runtime": 477.353, "train_tokens_per_second": 409.852 }, { "epoch": 0.4838709677419355, "grad_norm": 0.6882827877998352, "learning_rate": 0.00019909253552283143, "loss": 0.6016375422477722, "num_input_tokens_seen": 198956, "step": 60, "train_runtime": 484.0262, "train_tokens_per_second": 411.044 }, { "epoch": 0.4838709677419355, "eval_loss": 2.4849510192871094, "eval_runtime": 17.0161, "eval_samples_per_second": 3.056, "eval_steps_per_second": 1.528, "num_input_tokens_seen": 198956, "step": 60 }, { "epoch": 0.49193548387096775, "grad_norm": 0.7103594541549683, "learning_rate": 0.00019905802289373715, "loss": 0.5619174838066101, "num_input_tokens_seen": 202314, "step": 61, "train_runtime": 511.5866, "train_tokens_per_second": 395.464 }, { "epoch": 0.5, "grad_norm": 0.6471914052963257, "learning_rate": 0.0001990228692687429, "loss": 0.5455653667449951, "num_input_tokens_seen": 205722, "step": 62, "train_runtime": 518.1512, "train_tokens_per_second": 397.031 }, { "epoch": 0.5080645161290323, "grad_norm": 0.6389148235321045, "learning_rate": 0.00019898707487532474, "loss": 0.5808655619621277, "num_input_tokens_seen": 208882, "step": 63, "train_runtime": 524.5777, "train_tokens_per_second": 398.191 }, { "epoch": 0.5161290322580645, "grad_norm": 0.7594767212867737, "learning_rate": 0.0001989506399451051, "loss": 0.547818124294281, "num_input_tokens_seen": 212198, "step": 64, "train_runtime": 531.1436, "train_tokens_per_second": 399.512 }, { "epoch": 0.5241935483870968, "grad_norm": 0.739295244216919, "learning_rate": 0.0001989135647138513, "loss": 0.5600237846374512, "num_input_tokens_seen": 215586, "step": 65, "train_runtime": 537.8144, "train_tokens_per_second": 400.856 }, { "epoch": 0.532258064516129, "grad_norm": 0.9398884773254395, "learning_rate": 0.00019887584942147394, "loss": 0.6664551496505737, "num_input_tokens_seen": 218732, "step": 66, "train_runtime": 544.2058, "train_tokens_per_second": 401.929 }, { "epoch": 0.5403225806451613, "grad_norm": 0.731681764125824, "learning_rate": 0.0001988374943120254, "loss": 0.5805746912956238, "num_input_tokens_seen": 221938, "step": 67, "train_runtime": 550.7126, "train_tokens_per_second": 403.001 }, { "epoch": 0.5483870967741935, "grad_norm": 0.666251003742218, "learning_rate": 0.00019879849963369827, "loss": 0.5634634494781494, "num_input_tokens_seen": 225362, "step": 68, "train_runtime": 557.4868, "train_tokens_per_second": 404.246 }, { "epoch": 0.5564516129032258, "grad_norm": 0.6615156531333923, "learning_rate": 0.00019875886563882375, "loss": 0.5425825119018555, "num_input_tokens_seen": 228656, "step": 69, "train_runtime": 564.2058, "train_tokens_per_second": 405.271 }, { "epoch": 0.5645161290322581, "grad_norm": 0.7001394033432007, "learning_rate": 0.00019871859258387, "loss": 0.5410492420196533, "num_input_tokens_seen": 232052, "step": 70, "train_runtime": 570.8999, "train_tokens_per_second": 406.467 }, { "epoch": 0.5725806451612904, "grad_norm": 0.7109778523445129, "learning_rate": 0.00019867768072944045, "loss": 0.5101571083068848, "num_input_tokens_seen": 235430, "step": 71, "train_runtime": 577.493, "train_tokens_per_second": 407.676 }, { "epoch": 0.5806451612903226, "grad_norm": 0.606921911239624, "learning_rate": 0.00019863613034027224, "loss": 0.5397766828536987, "num_input_tokens_seen": 238792, "step": 72, "train_runtime": 584.0864, "train_tokens_per_second": 408.83 }, { "epoch": 0.5887096774193549, "grad_norm": 0.7634878754615784, "learning_rate": 0.0001985939416852343, "loss": 0.5851086378097534, "num_input_tokens_seen": 242136, "step": 73, "train_runtime": 590.6152, "train_tokens_per_second": 409.972 }, { "epoch": 0.5967741935483871, "grad_norm": 0.8077980875968933, "learning_rate": 0.00019855111503732574, "loss": 0.5656052231788635, "num_input_tokens_seen": 245502, "step": 74, "train_runtime": 597.2223, "train_tokens_per_second": 411.073 }, { "epoch": 0.6048387096774194, "grad_norm": 0.8043028712272644, "learning_rate": 0.00019850765067367412, "loss": 0.5640852451324463, "num_input_tokens_seen": 248724, "step": 75, "train_runtime": 603.5128, "train_tokens_per_second": 412.127 }, { "epoch": 0.6129032258064516, "grad_norm": 0.7303769588470459, "learning_rate": 0.00019846354887553358, "loss": 0.532991886138916, "num_input_tokens_seen": 252068, "step": 76, "train_runtime": 610.0683, "train_tokens_per_second": 413.18 }, { "epoch": 0.6209677419354839, "grad_norm": 0.7212783098220825, "learning_rate": 0.00019841880992828306, "loss": 0.5575728416442871, "num_input_tokens_seen": 255328, "step": 77, "train_runtime": 616.4056, "train_tokens_per_second": 414.221 }, { "epoch": 0.6290322580645161, "grad_norm": 0.7219402194023132, "learning_rate": 0.0001983734341214244, "loss": 0.5064490437507629, "num_input_tokens_seen": 258440, "step": 78, "train_runtime": 622.5559, "train_tokens_per_second": 415.127 }, { "epoch": 0.6370967741935484, "grad_norm": 0.7140385508537292, "learning_rate": 0.00019832742174858052, "loss": 0.4704212546348572, "num_input_tokens_seen": 261670, "step": 79, "train_runtime": 628.8997, "train_tokens_per_second": 416.076 }, { "epoch": 0.6451612903225806, "grad_norm": 0.752485990524292, "learning_rate": 0.0001982807731074935, "loss": 0.5312367677688599, "num_input_tokens_seen": 265042, "step": 80, "train_runtime": 635.5329, "train_tokens_per_second": 417.039 }, { "epoch": 0.6532258064516129, "grad_norm": 0.7832441926002502, "learning_rate": 0.00019823348850002268, "loss": 0.5208355188369751, "num_input_tokens_seen": 268396, "step": 81, "train_runtime": 642.0933, "train_tokens_per_second": 418.002 }, { "epoch": 0.6612903225806451, "grad_norm": 0.8507868051528931, "learning_rate": 0.00019818556823214268, "loss": 0.5739681124687195, "num_input_tokens_seen": 271554, "step": 82, "train_runtime": 648.5667, "train_tokens_per_second": 418.699 }, { "epoch": 0.6693548387096774, "grad_norm": 0.8015562295913696, "learning_rate": 0.00019813701261394136, "loss": 0.564713180065155, "num_input_tokens_seen": 274730, "step": 83, "train_runtime": 654.8764, "train_tokens_per_second": 419.514 }, { "epoch": 0.6774193548387096, "grad_norm": 0.6441566348075867, "learning_rate": 0.00019808782195961797, "loss": 0.5123165845870972, "num_input_tokens_seen": 278100, "step": 84, "train_runtime": 661.5162, "train_tokens_per_second": 420.398 }, { "epoch": 0.6854838709677419, "grad_norm": 0.6195685267448425, "learning_rate": 0.00019803799658748094, "loss": 0.4935515522956848, "num_input_tokens_seen": 281468, "step": 85, "train_runtime": 668.1642, "train_tokens_per_second": 421.256 }, { "epoch": 0.6935483870967742, "grad_norm": 0.6841112375259399, "learning_rate": 0.000197987536819946, "loss": 0.5896198153495789, "num_input_tokens_seen": 284944, "step": 86, "train_runtime": 675.0131, "train_tokens_per_second": 422.131 }, { "epoch": 0.7016129032258065, "grad_norm": 0.661755383014679, "learning_rate": 0.0001979364429835339, "loss": 0.4884531795978546, "num_input_tokens_seen": 288252, "step": 87, "train_runtime": 681.6095, "train_tokens_per_second": 422.899 }, { "epoch": 0.7096774193548387, "grad_norm": 0.5935579538345337, "learning_rate": 0.00019788471540886844, "loss": 0.47178950905799866, "num_input_tokens_seen": 291674, "step": 88, "train_runtime": 688.3131, "train_tokens_per_second": 423.752 }, { "epoch": 0.717741935483871, "grad_norm": 0.6006376147270203, "learning_rate": 0.0001978323544306743, "loss": 0.48318639397621155, "num_input_tokens_seen": 295054, "step": 89, "train_runtime": 695.0099, "train_tokens_per_second": 424.532 }, { "epoch": 0.7258064516129032, "grad_norm": 0.8534025549888611, "learning_rate": 0.00019777936038777483, "loss": 0.5595048666000366, "num_input_tokens_seen": 298162, "step": 90, "train_runtime": 701.2865, "train_tokens_per_second": 425.164 }, { "epoch": 0.7258064516129032, "eval_loss": 2.394501209259033, "eval_runtime": 16.981, "eval_samples_per_second": 3.062, "eval_steps_per_second": 1.531, "num_input_tokens_seen": 298162, "step": 90 }, { "epoch": 0.7338709677419355, "grad_norm": 0.6207215189933777, "learning_rate": 0.0001977257336230899, "loss": 0.4640101492404938, "num_input_tokens_seen": 301500, "step": 91, "train_runtime": 729.0133, "train_tokens_per_second": 413.573 }, { "epoch": 0.7419354838709677, "grad_norm": 0.7542288899421692, "learning_rate": 0.00019767147448363366, "loss": 0.49243515729904175, "num_input_tokens_seen": 304624, "step": 92, "train_runtime": 735.2559, "train_tokens_per_second": 414.31 }, { "epoch": 0.75, "grad_norm": 0.7016634941101074, "learning_rate": 0.00019761658332051235, "loss": 0.5259037017822266, "num_input_tokens_seen": 307956, "step": 93, "train_runtime": 741.7904, "train_tokens_per_second": 415.152 }, { "epoch": 0.7580645161290323, "grad_norm": 0.6728646159172058, "learning_rate": 0.00019756106048892186, "loss": 0.5248728394508362, "num_input_tokens_seen": 311354, "step": 94, "train_runtime": 748.457, "train_tokens_per_second": 415.995 }, { "epoch": 0.7661290322580645, "grad_norm": 0.723591685295105, "learning_rate": 0.00019750490634814572, "loss": 0.5338126420974731, "num_input_tokens_seen": 314592, "step": 95, "train_runtime": 754.9632, "train_tokens_per_second": 416.698 }, { "epoch": 0.7741935483870968, "grad_norm": 0.7161297798156738, "learning_rate": 0.00019744812126155245, "loss": 0.5010942220687866, "num_input_tokens_seen": 317960, "step": 96, "train_runtime": 761.6717, "train_tokens_per_second": 417.45 }, { "epoch": 0.782258064516129, "grad_norm": 0.6660385131835938, "learning_rate": 0.00019739070559659347, "loss": 0.526728093624115, "num_input_tokens_seen": 321274, "step": 97, "train_runtime": 768.2895, "train_tokens_per_second": 418.168 }, { "epoch": 0.7903225806451613, "grad_norm": 0.724772572517395, "learning_rate": 0.0001973326597248006, "loss": 0.548335611820221, "num_input_tokens_seen": 324480, "step": 98, "train_runtime": 774.6668, "train_tokens_per_second": 418.864 }, { "epoch": 0.7983870967741935, "grad_norm": 0.6925802230834961, "learning_rate": 0.0001972739840217836, "loss": 0.4738383889198303, "num_input_tokens_seen": 327678, "step": 99, "train_runtime": 781.1283, "train_tokens_per_second": 419.493 }, { "epoch": 0.8064516129032258, "grad_norm": 0.6755756139755249, "learning_rate": 0.00019721467886722792, "loss": 0.5126165151596069, "num_input_tokens_seen": 330914, "step": 100, "train_runtime": 787.5258, "train_tokens_per_second": 420.195 }, { "epoch": 0.8145161290322581, "grad_norm": 0.7228033542633057, "learning_rate": 0.00019715474464489208, "loss": 0.51287442445755, "num_input_tokens_seen": 334256, "step": 101, "train_runtime": 794.2459, "train_tokens_per_second": 420.847 }, { "epoch": 0.8225806451612904, "grad_norm": 0.636013925075531, "learning_rate": 0.0001970941817426052, "loss": 0.5042312741279602, "num_input_tokens_seen": 337722, "step": 102, "train_runtime": 800.8987, "train_tokens_per_second": 421.679 }, { "epoch": 0.8306451612903226, "grad_norm": 0.64854496717453, "learning_rate": 0.00019703299055226468, "loss": 0.49396568536758423, "num_input_tokens_seen": 341038, "step": 103, "train_runtime": 807.5236, "train_tokens_per_second": 422.326 }, { "epoch": 0.8387096774193549, "grad_norm": 0.7666099667549133, "learning_rate": 0.00019697117146983334, "loss": 0.5051327347755432, "num_input_tokens_seen": 344066, "step": 104, "train_runtime": 813.5636, "train_tokens_per_second": 422.912 }, { "epoch": 0.8467741935483871, "grad_norm": 0.7877888679504395, "learning_rate": 0.0001969087248953371, "loss": 0.5758072137832642, "num_input_tokens_seen": 347352, "step": 105, "train_runtime": 820.2344, "train_tokens_per_second": 423.479 }, { "epoch": 0.8548387096774194, "grad_norm": 0.6786578893661499, "learning_rate": 0.00019684565123286244, "loss": 0.4985409080982208, "num_input_tokens_seen": 350654, "step": 106, "train_runtime": 826.7619, "train_tokens_per_second": 424.129 }, { "epoch": 0.8629032258064516, "grad_norm": 0.7095152735710144, "learning_rate": 0.00019678195089055346, "loss": 0.5382249355316162, "num_input_tokens_seen": 353954, "step": 107, "train_runtime": 833.2825, "train_tokens_per_second": 424.771 }, { "epoch": 0.8709677419354839, "grad_norm": 0.6485514044761658, "learning_rate": 0.00019671762428060966, "loss": 0.5011004209518433, "num_input_tokens_seen": 357348, "step": 108, "train_runtime": 839.895, "train_tokens_per_second": 425.467 }, { "epoch": 0.8790322580645161, "grad_norm": 0.6399499773979187, "learning_rate": 0.00019665267181928292, "loss": 0.5171963572502136, "num_input_tokens_seen": 360628, "step": 109, "train_runtime": 846.4457, "train_tokens_per_second": 426.05 }, { "epoch": 0.8870967741935484, "grad_norm": 0.6398491263389587, "learning_rate": 0.00019658709392687506, "loss": 0.5169506669044495, "num_input_tokens_seen": 364030, "step": 110, "train_runtime": 853.0569, "train_tokens_per_second": 426.736 }, { "epoch": 0.8951612903225806, "grad_norm": 0.6788223385810852, "learning_rate": 0.00019652089102773488, "loss": 0.5841289758682251, "num_input_tokens_seen": 367360, "step": 111, "train_runtime": 859.6357, "train_tokens_per_second": 427.344 }, { "epoch": 0.9032258064516129, "grad_norm": 0.6092165112495422, "learning_rate": 0.00019645406355025565, "loss": 0.4721071720123291, "num_input_tokens_seen": 370518, "step": 112, "train_runtime": 865.9187, "train_tokens_per_second": 427.89 }, { "epoch": 0.9112903225806451, "grad_norm": 0.5953733325004578, "learning_rate": 0.00019638661192687216, "loss": 0.4920331835746765, "num_input_tokens_seen": 373828, "step": 113, "train_runtime": 872.4705, "train_tokens_per_second": 428.471 }, { "epoch": 0.9193548387096774, "grad_norm": 0.6158846020698547, "learning_rate": 0.00019631853659405807, "loss": 0.5007290840148926, "num_input_tokens_seen": 377226, "step": 114, "train_runtime": 879.1582, "train_tokens_per_second": 429.076 }, { "epoch": 0.9274193548387096, "grad_norm": 0.5616441369056702, "learning_rate": 0.000196249837992323, "loss": 0.4844634234905243, "num_input_tokens_seen": 380624, "step": 115, "train_runtime": 885.8971, "train_tokens_per_second": 429.648 }, { "epoch": 0.9354838709677419, "grad_norm": 0.616858184337616, "learning_rate": 0.0001961805165662096, "loss": 0.4595881402492523, "num_input_tokens_seen": 384012, "step": 116, "train_runtime": 892.5519, "train_tokens_per_second": 430.241 }, { "epoch": 0.9435483870967742, "grad_norm": 0.6438707113265991, "learning_rate": 0.00019611057276429085, "loss": 0.5118931531906128, "num_input_tokens_seen": 387364, "step": 117, "train_runtime": 899.1551, "train_tokens_per_second": 430.809 }, { "epoch": 0.9516129032258065, "grad_norm": 0.6319488883018494, "learning_rate": 0.00019604000703916705, "loss": 0.479214072227478, "num_input_tokens_seen": 390740, "step": 118, "train_runtime": 905.8859, "train_tokens_per_second": 431.335 }, { "epoch": 0.9596774193548387, "grad_norm": 0.7468289136886597, "learning_rate": 0.00019596881984746287, "loss": 0.49020278453826904, "num_input_tokens_seen": 394002, "step": 119, "train_runtime": 912.4634, "train_tokens_per_second": 431.8 }, { "epoch": 0.967741935483871, "grad_norm": 0.795864462852478, "learning_rate": 0.00019589701164982452, "loss": 0.49333739280700684, "num_input_tokens_seen": 397036, "step": 120, "train_runtime": 918.6599, "train_tokens_per_second": 432.19 }, { "epoch": 0.967741935483871, "eval_loss": 2.3108139038085938, "eval_runtime": 17.0317, "eval_samples_per_second": 3.053, "eval_steps_per_second": 1.527, "num_input_tokens_seen": 397036, "step": 120 }, { "epoch": 0.9758064516129032, "grad_norm": 0.7513642907142639, "learning_rate": 0.00019582458291091663, "loss": 0.5653488636016846, "num_input_tokens_seen": 400216, "step": 121, "train_runtime": 946.0725, "train_tokens_per_second": 423.029 }, { "epoch": 0.9838709677419355, "grad_norm": 0.6150530576705933, "learning_rate": 0.0001957515340994193, "loss": 0.48984086513519287, "num_input_tokens_seen": 403522, "step": 122, "train_runtime": 952.5907, "train_tokens_per_second": 423.605 }, { "epoch": 0.9919354838709677, "grad_norm": 0.6230205297470093, "learning_rate": 0.000195677865688025, "loss": 0.52336186170578, "num_input_tokens_seen": 406874, "step": 123, "train_runtime": 959.2256, "train_tokens_per_second": 424.169 }, { "epoch": 1.0, "grad_norm": 0.684869110584259, "learning_rate": 0.00019560357815343577, "loss": 0.4886379539966583, "num_input_tokens_seen": 410114, "step": 124, "train_runtime": 965.6844, "train_tokens_per_second": 424.687 }, { "epoch": 1.0080645161290323, "grad_norm": 0.597822904586792, "learning_rate": 0.00019552867197635974, "loss": 0.41038113832473755, "num_input_tokens_seen": 413264, "step": 125, "train_runtime": 972.0355, "train_tokens_per_second": 425.153 }, { "epoch": 1.0161290322580645, "grad_norm": 0.6053985357284546, "learning_rate": 0.00019545314764150837, "loss": 0.3999386131763458, "num_input_tokens_seen": 416432, "step": 126, "train_runtime": 978.3998, "train_tokens_per_second": 425.626 }, { "epoch": 1.0241935483870968, "grad_norm": 0.5770310759544373, "learning_rate": 0.00019537700563759304, "loss": 0.43007850646972656, "num_input_tokens_seen": 419814, "step": 127, "train_runtime": 985.1414, "train_tokens_per_second": 426.146 }, { "epoch": 1.032258064516129, "grad_norm": 0.6475788354873657, "learning_rate": 0.00019530024645732206, "loss": 0.4416751563549042, "num_input_tokens_seen": 423168, "step": 128, "train_runtime": 991.8066, "train_tokens_per_second": 426.664 }, { "epoch": 1.0403225806451613, "grad_norm": 0.6236942410469055, "learning_rate": 0.00019522287059739753, "loss": 0.4056033790111542, "num_input_tokens_seen": 426248, "step": 129, "train_runtime": 997.974, "train_tokens_per_second": 427.113 }, { "epoch": 1.0483870967741935, "grad_norm": 0.6982079148292542, "learning_rate": 0.00019514487855851184, "loss": 0.4574550986289978, "num_input_tokens_seen": 429726, "step": 130, "train_runtime": 1004.7068, "train_tokens_per_second": 427.713 }, { "epoch": 1.0564516129032258, "grad_norm": 0.6114121079444885, "learning_rate": 0.00019506627084534483, "loss": 0.4151981472969055, "num_input_tokens_seen": 432912, "step": 131, "train_runtime": 1011.1009, "train_tokens_per_second": 428.159 }, { "epoch": 1.064516129032258, "grad_norm": 0.6755998730659485, "learning_rate": 0.00019498704796656018, "loss": 0.4350915551185608, "num_input_tokens_seen": 436246, "step": 132, "train_runtime": 1017.7642, "train_tokens_per_second": 428.632 }, { "epoch": 1.0725806451612903, "grad_norm": 0.7091481685638428, "learning_rate": 0.00019490721043480226, "loss": 0.44581809639930725, "num_input_tokens_seen": 439446, "step": 133, "train_runtime": 1024.1345, "train_tokens_per_second": 429.09 }, { "epoch": 1.0806451612903225, "grad_norm": 0.6453033685684204, "learning_rate": 0.00019482675876669286, "loss": 0.41440466046333313, "num_input_tokens_seen": 442716, "step": 134, "train_runtime": 1030.6631, "train_tokens_per_second": 429.545 }, { "epoch": 1.0887096774193548, "grad_norm": 0.6018978357315063, "learning_rate": 0.00019474569348282774, "loss": 0.41350579261779785, "num_input_tokens_seen": 445948, "step": 135, "train_runtime": 1037.0128, "train_tokens_per_second": 430.031 }, { "epoch": 1.096774193548387, "grad_norm": 0.7258074879646301, "learning_rate": 0.0001946640151077734, "loss": 0.44569528102874756, "num_input_tokens_seen": 449172, "step": 136, "train_runtime": 1043.4434, "train_tokens_per_second": 430.471 }, { "epoch": 1.1048387096774193, "grad_norm": 0.7976507544517517, "learning_rate": 0.00019458172417006347, "loss": 0.5195361375808716, "num_input_tokens_seen": 452512, "step": 137, "train_runtime": 1050.0077, "train_tokens_per_second": 430.961 }, { "epoch": 1.1129032258064515, "grad_norm": 0.7825067639350891, "learning_rate": 0.00019449882120219555, "loss": 0.44283729791641235, "num_input_tokens_seen": 455922, "step": 138, "train_runtime": 1056.6883, "train_tokens_per_second": 431.463 }, { "epoch": 1.120967741935484, "grad_norm": 0.66150963306427, "learning_rate": 0.00019441530674062753, "loss": 0.42789438366889954, "num_input_tokens_seen": 459246, "step": 139, "train_runtime": 1063.2307, "train_tokens_per_second": 431.934 }, { "epoch": 1.129032258064516, "grad_norm": 0.645460307598114, "learning_rate": 0.0001943311813257743, "loss": 0.38796648383140564, "num_input_tokens_seen": 462644, "step": 140, "train_runtime": 1069.8507, "train_tokens_per_second": 432.438 }, { "epoch": 1.1370967741935485, "grad_norm": 0.6324282884597778, "learning_rate": 0.00019424644550200415, "loss": 0.43011078238487244, "num_input_tokens_seen": 466004, "step": 141, "train_runtime": 1076.4369, "train_tokens_per_second": 432.913 }, { "epoch": 1.1451612903225807, "grad_norm": 0.6442665457725525, "learning_rate": 0.00019416109981763526, "loss": 0.4545246958732605, "num_input_tokens_seen": 469474, "step": 142, "train_runtime": 1083.2686, "train_tokens_per_second": 433.386 }, { "epoch": 1.153225806451613, "grad_norm": 0.6576758623123169, "learning_rate": 0.00019407514482493214, "loss": 0.4253957271575928, "num_input_tokens_seen": 472830, "step": 143, "train_runtime": 1089.8639, "train_tokens_per_second": 433.843 }, { "epoch": 1.1612903225806452, "grad_norm": 0.7580392956733704, "learning_rate": 0.00019398858108010217, "loss": 0.45867177844047546, "num_input_tokens_seen": 476146, "step": 144, "train_runtime": 1096.4187, "train_tokens_per_second": 434.274 }, { "epoch": 1.1693548387096775, "grad_norm": 0.7474371790885925, "learning_rate": 0.0001939014091432918, "loss": 0.46455514430999756, "num_input_tokens_seen": 479388, "step": 145, "train_runtime": 1102.8261, "train_tokens_per_second": 434.69 }, { "epoch": 1.1774193548387097, "grad_norm": 0.636562168598175, "learning_rate": 0.00019381362957858312, "loss": 0.4370546340942383, "num_input_tokens_seen": 482820, "step": 146, "train_runtime": 1109.5311, "train_tokens_per_second": 435.157 }, { "epoch": 1.185483870967742, "grad_norm": 0.6858648061752319, "learning_rate": 0.00019372524295399013, "loss": 0.3828548491001129, "num_input_tokens_seen": 485956, "step": 147, "train_runtime": 1115.864, "train_tokens_per_second": 435.498 }, { "epoch": 1.1935483870967742, "grad_norm": 0.7505737543106079, "learning_rate": 0.00019363624984145502, "loss": 0.43668264150619507, "num_input_tokens_seen": 489266, "step": 148, "train_runtime": 1122.4445, "train_tokens_per_second": 435.893 }, { "epoch": 1.2016129032258065, "grad_norm": 0.6092221140861511, "learning_rate": 0.00019354665081684446, "loss": 0.36623674631118774, "num_input_tokens_seen": 492712, "step": 149, "train_runtime": 1129.2289, "train_tokens_per_second": 436.326 }, { "epoch": 1.2096774193548387, "grad_norm": 0.6618177890777588, "learning_rate": 0.0001934564464599461, "loss": 0.36393943428993225, "num_input_tokens_seen": 496182, "step": 150, "train_runtime": 1135.9402, "train_tokens_per_second": 436.803 }, { "epoch": 1.2096774193548387, "eval_loss": 2.468383550643921, "eval_runtime": 16.9717, "eval_samples_per_second": 3.064, "eval_steps_per_second": 1.532, "num_input_tokens_seen": 496182, "step": 150 }, { "epoch": 1.217741935483871, "grad_norm": 0.7288278341293335, "learning_rate": 0.00019336563735446446, "loss": 0.47049954533576965, "num_input_tokens_seen": 499482, "step": 151, "train_runtime": 1163.8126, "train_tokens_per_second": 429.177 }, { "epoch": 1.2258064516129032, "grad_norm": 0.6693354845046997, "learning_rate": 0.00019327422408801744, "loss": 0.4141227900981903, "num_input_tokens_seen": 502832, "step": 152, "train_runtime": 1170.3921, "train_tokens_per_second": 429.627 }, { "epoch": 1.2338709677419355, "grad_norm": 0.7173429727554321, "learning_rate": 0.0001931822072521323, "loss": 0.4298741817474365, "num_input_tokens_seen": 506196, "step": 153, "train_runtime": 1176.9716, "train_tokens_per_second": 430.083 }, { "epoch": 1.2419354838709677, "grad_norm": 0.7945153117179871, "learning_rate": 0.00019308958744224217, "loss": 0.4285031855106354, "num_input_tokens_seen": 509348, "step": 154, "train_runtime": 1183.3919, "train_tokens_per_second": 430.414 }, { "epoch": 1.25, "grad_norm": 0.6567854285240173, "learning_rate": 0.00019299636525768173, "loss": 0.40128999948501587, "num_input_tokens_seen": 512728, "step": 155, "train_runtime": 1190.0997, "train_tokens_per_second": 430.828 }, { "epoch": 1.2580645161290323, "grad_norm": 0.6827821135520935, "learning_rate": 0.00019290254130168374, "loss": 0.3844904899597168, "num_input_tokens_seen": 516078, "step": 156, "train_runtime": 1196.7623, "train_tokens_per_second": 431.229 }, { "epoch": 1.2661290322580645, "grad_norm": 0.6366832256317139, "learning_rate": 0.00019280811618137484, "loss": 0.42982959747314453, "num_input_tokens_seen": 519482, "step": 157, "train_runtime": 1203.4902, "train_tokens_per_second": 431.646 }, { "epoch": 1.2741935483870968, "grad_norm": 0.6760056614875793, "learning_rate": 0.00019271309050777183, "loss": 0.4195765256881714, "num_input_tokens_seen": 522758, "step": 158, "train_runtime": 1209.9586, "train_tokens_per_second": 432.046 }, { "epoch": 1.282258064516129, "grad_norm": 0.7459661960601807, "learning_rate": 0.00019261746489577765, "loss": 0.4203264117240906, "num_input_tokens_seen": 525784, "step": 159, "train_runtime": 1216.0785, "train_tokens_per_second": 432.36 }, { "epoch": 1.2903225806451613, "grad_norm": 0.5943926572799683, "learning_rate": 0.00019252123996417738, "loss": 0.3937755525112152, "num_input_tokens_seen": 529130, "step": 160, "train_runtime": 1222.6742, "train_tokens_per_second": 432.765 }, { "epoch": 1.2983870967741935, "grad_norm": 0.6527149677276611, "learning_rate": 0.00019242441633563417, "loss": 0.4311143755912781, "num_input_tokens_seen": 532308, "step": 161, "train_runtime": 1229.0914, "train_tokens_per_second": 433.091 }, { "epoch": 1.3064516129032258, "grad_norm": 0.6006942987442017, "learning_rate": 0.00019232699463668542, "loss": 0.4009702205657959, "num_input_tokens_seen": 535662, "step": 162, "train_runtime": 1235.7182, "train_tokens_per_second": 433.482 }, { "epoch": 1.314516129032258, "grad_norm": 0.708247721195221, "learning_rate": 0.00019222897549773848, "loss": 0.4113525152206421, "num_input_tokens_seen": 539056, "step": 163, "train_runtime": 1242.3294, "train_tokens_per_second": 433.907 }, { "epoch": 1.3225806451612903, "grad_norm": 0.5651199221611023, "learning_rate": 0.0001921303595530667, "loss": 0.36679160594940186, "num_input_tokens_seen": 542450, "step": 164, "train_runtime": 1248.9434, "train_tokens_per_second": 434.327 }, { "epoch": 1.3306451612903225, "grad_norm": 0.7649331092834473, "learning_rate": 0.00019203114744080542, "loss": 0.449223130941391, "num_input_tokens_seen": 545796, "step": 165, "train_runtime": 1255.5695, "train_tokens_per_second": 434.7 }, { "epoch": 1.3387096774193548, "grad_norm": 0.7255364656448364, "learning_rate": 0.0001919313398029475, "loss": 0.4195847511291504, "num_input_tokens_seen": 548960, "step": 166, "train_runtime": 1261.8368, "train_tokens_per_second": 435.048 }, { "epoch": 1.346774193548387, "grad_norm": 0.7223755717277527, "learning_rate": 0.00019183093728533966, "loss": 0.41058725118637085, "num_input_tokens_seen": 552310, "step": 167, "train_runtime": 1268.4105, "train_tokens_per_second": 435.435 }, { "epoch": 1.3548387096774195, "grad_norm": 0.6943632364273071, "learning_rate": 0.00019172994053767784, "loss": 0.41625842452049255, "num_input_tokens_seen": 555660, "step": 168, "train_runtime": 1274.9504, "train_tokens_per_second": 435.829 }, { "epoch": 1.3629032258064515, "grad_norm": 0.7306869626045227, "learning_rate": 0.0001916283502135033, "loss": 0.4193657636642456, "num_input_tokens_seen": 558942, "step": 169, "train_runtime": 1281.4937, "train_tokens_per_second": 436.164 }, { "epoch": 1.370967741935484, "grad_norm": 0.9073975682258606, "learning_rate": 0.00019152616697019822, "loss": 0.48369550704956055, "num_input_tokens_seen": 561914, "step": 170, "train_runtime": 1287.5106, "train_tokens_per_second": 436.434 }, { "epoch": 1.379032258064516, "grad_norm": 0.6838721632957458, "learning_rate": 0.0001914233914689815, "loss": 0.44988489151000977, "num_input_tokens_seen": 565128, "step": 171, "train_runtime": 1293.994, "train_tokens_per_second": 436.732 }, { "epoch": 1.3870967741935485, "grad_norm": 0.7812486886978149, "learning_rate": 0.00019132002437490458, "loss": 0.40069717168807983, "num_input_tokens_seen": 568492, "step": 172, "train_runtime": 1300.6183, "train_tokens_per_second": 437.094 }, { "epoch": 1.3951612903225805, "grad_norm": 0.7606605291366577, "learning_rate": 0.00019121606635684696, "loss": 0.46690768003463745, "num_input_tokens_seen": 571838, "step": 173, "train_runtime": 1307.2362, "train_tokens_per_second": 437.44 }, { "epoch": 1.403225806451613, "grad_norm": 0.6010671854019165, "learning_rate": 0.00019111151808751196, "loss": 0.36060816049575806, "num_input_tokens_seen": 575172, "step": 174, "train_runtime": 1313.8279, "train_tokens_per_second": 437.783 }, { "epoch": 1.4112903225806452, "grad_norm": 0.7686548829078674, "learning_rate": 0.00019100638024342244, "loss": 0.42185837030410767, "num_input_tokens_seen": 578472, "step": 175, "train_runtime": 1320.4302, "train_tokens_per_second": 438.094 }, { "epoch": 1.4193548387096775, "grad_norm": 0.7026260495185852, "learning_rate": 0.00019090065350491626, "loss": 0.4178864061832428, "num_input_tokens_seen": 581854, "step": 176, "train_runtime": 1327.0726, "train_tokens_per_second": 438.449 }, { "epoch": 1.4274193548387097, "grad_norm": 0.64774090051651, "learning_rate": 0.00019079433855614201, "loss": 0.40213117003440857, "num_input_tokens_seen": 585240, "step": 177, "train_runtime": 1333.7896, "train_tokens_per_second": 438.78 }, { "epoch": 1.435483870967742, "grad_norm": 0.7670486569404602, "learning_rate": 0.00019068743608505455, "loss": 0.4300892651081085, "num_input_tokens_seen": 588466, "step": 178, "train_runtime": 1340.1622, "train_tokens_per_second": 439.101 }, { "epoch": 1.4435483870967742, "grad_norm": 0.7370812296867371, "learning_rate": 0.0001905799467834105, "loss": 0.4132138788700104, "num_input_tokens_seen": 591716, "step": 179, "train_runtime": 1346.5699, "train_tokens_per_second": 439.425 }, { "epoch": 1.4516129032258065, "grad_norm": 0.6373615860939026, "learning_rate": 0.00019047187134676387, "loss": 0.4021753668785095, "num_input_tokens_seen": 595124, "step": 180, "train_runtime": 1353.3295, "train_tokens_per_second": 439.748 }, { "epoch": 1.4516129032258065, "eval_loss": 2.275111675262451, "eval_runtime": 17.0147, "eval_samples_per_second": 3.056, "eval_steps_per_second": 1.528, "num_input_tokens_seen": 595124, "step": 180 }, { "epoch": 1.4596774193548387, "grad_norm": 0.6791172623634338, "learning_rate": 0.0001903632104744614, "loss": 0.4088537395000458, "num_input_tokens_seen": 598560, "step": 181, "train_runtime": 1383.6055, "train_tokens_per_second": 432.609 }, { "epoch": 1.467741935483871, "grad_norm": 0.8019388318061829, "learning_rate": 0.00019025396486963827, "loss": 0.43525320291519165, "num_input_tokens_seen": 601832, "step": 182, "train_runtime": 1390.1876, "train_tokens_per_second": 432.914 }, { "epoch": 1.4758064516129032, "grad_norm": 0.7230949997901917, "learning_rate": 0.0001901441352392133, "loss": 0.4377875030040741, "num_input_tokens_seen": 605124, "step": 183, "train_runtime": 1396.7627, "train_tokens_per_second": 433.233 }, { "epoch": 1.4838709677419355, "grad_norm": 0.7066268920898438, "learning_rate": 0.00019003372229388452, "loss": 0.4358859956264496, "num_input_tokens_seen": 608550, "step": 184, "train_runtime": 1403.4877, "train_tokens_per_second": 433.598 }, { "epoch": 1.4919354838709677, "grad_norm": 0.6805543899536133, "learning_rate": 0.0001899227267481246, "loss": 0.4511941969394684, "num_input_tokens_seen": 611912, "step": 185, "train_runtime": 1410.0797, "train_tokens_per_second": 433.956 }, { "epoch": 1.5, "grad_norm": 0.7089034914970398, "learning_rate": 0.00018981114932017609, "loss": 0.4455965757369995, "num_input_tokens_seen": 615114, "step": 186, "train_runtime": 1416.5446, "train_tokens_per_second": 434.236 }, { "epoch": 1.5080645161290323, "grad_norm": 0.6060784459114075, "learning_rate": 0.00018969899073204686, "loss": 0.40845364332199097, "num_input_tokens_seen": 618532, "step": 187, "train_runtime": 1423.1876, "train_tokens_per_second": 434.61 }, { "epoch": 1.5161290322580645, "grad_norm": 0.6854289174079895, "learning_rate": 0.00018958625170950545, "loss": 0.43267232179641724, "num_input_tokens_seen": 621584, "step": 188, "train_runtime": 1429.4206, "train_tokens_per_second": 434.85 }, { "epoch": 1.5241935483870968, "grad_norm": 0.641149640083313, "learning_rate": 0.00018947293298207635, "loss": 0.44395285844802856, "num_input_tokens_seen": 624890, "step": 189, "train_runtime": 1435.9579, "train_tokens_per_second": 435.173 }, { "epoch": 1.532258064516129, "grad_norm": 0.761741578578949, "learning_rate": 0.00018935903528303523, "loss": 0.44854938983917236, "num_input_tokens_seen": 627898, "step": 190, "train_runtime": 1442.0928, "train_tokens_per_second": 435.407 }, { "epoch": 1.5403225806451613, "grad_norm": 0.6175191402435303, "learning_rate": 0.0001892445593494042, "loss": 0.38872429728507996, "num_input_tokens_seen": 631260, "step": 191, "train_runtime": 1448.6619, "train_tokens_per_second": 435.754 }, { "epoch": 1.5483870967741935, "grad_norm": 0.6766597032546997, "learning_rate": 0.0001891295059219472, "loss": 0.39698436856269836, "num_input_tokens_seen": 634598, "step": 192, "train_runtime": 1455.2728, "train_tokens_per_second": 436.068 }, { "epoch": 1.5564516129032258, "grad_norm": 0.7066875100135803, "learning_rate": 0.00018901387574516497, "loss": 0.4259159564971924, "num_input_tokens_seen": 637776, "step": 193, "train_runtime": 1461.5701, "train_tokens_per_second": 436.364 }, { "epoch": 1.564516129032258, "grad_norm": 0.7477712035179138, "learning_rate": 0.00018889766956729044, "loss": 0.41065987944602966, "num_input_tokens_seen": 641100, "step": 194, "train_runtime": 1468.1552, "train_tokens_per_second": 436.67 }, { "epoch": 1.5725806451612905, "grad_norm": 0.7463842630386353, "learning_rate": 0.00018878088814028364, "loss": 0.42325031757354736, "num_input_tokens_seen": 644472, "step": 195, "train_runtime": 1474.7266, "train_tokens_per_second": 437.011 }, { "epoch": 1.5806451612903225, "grad_norm": 0.6973960399627686, "learning_rate": 0.00018866353221982718, "loss": 0.36401230096817017, "num_input_tokens_seen": 647738, "step": 196, "train_runtime": 1481.2508, "train_tokens_per_second": 437.291 }, { "epoch": 1.588709677419355, "grad_norm": 0.7673096656799316, "learning_rate": 0.000188545602565321, "loss": 0.4327010214328766, "num_input_tokens_seen": 651030, "step": 197, "train_runtime": 1487.7683, "train_tokens_per_second": 437.588 }, { "epoch": 1.596774193548387, "grad_norm": 0.7988507151603699, "learning_rate": 0.00018842709993987776, "loss": 0.4233095347881317, "num_input_tokens_seen": 654362, "step": 198, "train_runtime": 1494.4203, "train_tokens_per_second": 437.87 }, { "epoch": 1.6048387096774195, "grad_norm": 0.764127254486084, "learning_rate": 0.00018830802511031762, "loss": 0.3952127695083618, "num_input_tokens_seen": 657668, "step": 199, "train_runtime": 1500.9816, "train_tokens_per_second": 438.159 }, { "epoch": 1.6129032258064515, "grad_norm": 0.7098085284233093, "learning_rate": 0.0001881883788471636, "loss": 0.40977275371551514, "num_input_tokens_seen": 661006, "step": 200, "train_runtime": 1507.5559, "train_tokens_per_second": 438.462 }, { "epoch": 1.620967741935484, "grad_norm": 0.7413765788078308, "learning_rate": 0.00018806816192463625, "loss": 0.42479950189590454, "num_input_tokens_seen": 664466, "step": 201, "train_runtime": 1514.302, "train_tokens_per_second": 438.794 }, { "epoch": 1.629032258064516, "grad_norm": 0.8011177778244019, "learning_rate": 0.0001879473751206489, "loss": 0.4388383626937866, "num_input_tokens_seen": 667634, "step": 202, "train_runtime": 1520.6629, "train_tokens_per_second": 439.041 }, { "epoch": 1.6370967741935485, "grad_norm": 0.6176231503486633, "learning_rate": 0.00018782601921680256, "loss": 0.4233701825141907, "num_input_tokens_seen": 671080, "step": 203, "train_runtime": 1527.319, "train_tokens_per_second": 439.384 }, { "epoch": 1.6451612903225805, "grad_norm": 0.6928187012672424, "learning_rate": 0.00018770409499838073, "loss": 0.4532501995563507, "num_input_tokens_seen": 674508, "step": 204, "train_runtime": 1534.0098, "train_tokens_per_second": 439.703 }, { "epoch": 1.653225806451613, "grad_norm": 0.710063636302948, "learning_rate": 0.0001875816032543445, "loss": 0.37533634901046753, "num_input_tokens_seen": 677700, "step": 205, "train_runtime": 1540.4655, "train_tokens_per_second": 439.932 }, { "epoch": 1.661290322580645, "grad_norm": 0.6659166216850281, "learning_rate": 0.00018745854477732733, "loss": 0.3989948630332947, "num_input_tokens_seen": 680978, "step": 206, "train_runtime": 1546.9747, "train_tokens_per_second": 440.2 }, { "epoch": 1.6693548387096775, "grad_norm": 0.5847692489624023, "learning_rate": 0.00018733492036363005, "loss": 0.39292144775390625, "num_input_tokens_seen": 684328, "step": 207, "train_runtime": 1553.5704, "train_tokens_per_second": 440.487 }, { "epoch": 1.6774193548387095, "grad_norm": 0.6828010082244873, "learning_rate": 0.0001872107308132155, "loss": 0.4327988028526306, "num_input_tokens_seen": 687676, "step": 208, "train_runtime": 1560.1397, "train_tokens_per_second": 440.778 }, { "epoch": 1.685483870967742, "grad_norm": 0.6712137460708618, "learning_rate": 0.00018708597692970353, "loss": 0.38148653507232666, "num_input_tokens_seen": 691000, "step": 209, "train_runtime": 1566.7698, "train_tokens_per_second": 441.035 }, { "epoch": 1.6935483870967742, "grad_norm": 0.8174690008163452, "learning_rate": 0.00018696065952036571, "loss": 0.4306122660636902, "num_input_tokens_seen": 694156, "step": 210, "train_runtime": 1573.041, "train_tokens_per_second": 441.283 }, { "epoch": 1.6935483870967742, "eval_loss": 2.281032085418701, "eval_runtime": 16.9723, "eval_samples_per_second": 3.064, "eval_steps_per_second": 1.532, "num_input_tokens_seen": 694156, "step": 210 }, { "epoch": 1.7016129032258065, "grad_norm": 0.641558825969696, "learning_rate": 0.00018683477939612021, "loss": 0.390179306268692, "num_input_tokens_seen": 697510, "step": 211, "train_runtime": 1600.5748, "train_tokens_per_second": 435.787 }, { "epoch": 1.7096774193548387, "grad_norm": 0.6440213918685913, "learning_rate": 0.0001867083373715264, "loss": 0.3886089622974396, "num_input_tokens_seen": 700884, "step": 212, "train_runtime": 1607.2259, "train_tokens_per_second": 436.083 }, { "epoch": 1.717741935483871, "grad_norm": 0.6989295482635498, "learning_rate": 0.00018658133426477965, "loss": 0.41528093814849854, "num_input_tokens_seen": 704288, "step": 213, "train_runtime": 1613.9043, "train_tokens_per_second": 436.388 }, { "epoch": 1.7258064516129032, "grad_norm": 0.785047173500061, "learning_rate": 0.00018645377089770616, "loss": 0.4055544435977936, "num_input_tokens_seen": 707404, "step": 214, "train_runtime": 1620.1995, "train_tokens_per_second": 436.615 }, { "epoch": 1.7338709677419355, "grad_norm": 0.7163392305374146, "learning_rate": 0.00018632564809575742, "loss": 0.417140930891037, "num_input_tokens_seen": 710738, "step": 215, "train_runtime": 1626.8303, "train_tokens_per_second": 436.885 }, { "epoch": 1.7419354838709677, "grad_norm": 0.8293558955192566, "learning_rate": 0.00018619696668800492, "loss": 0.4728226959705353, "num_input_tokens_seen": 714052, "step": 216, "train_runtime": 1633.4113, "train_tokens_per_second": 437.154 }, { "epoch": 1.75, "grad_norm": 0.6450289487838745, "learning_rate": 0.00018606772750713504, "loss": 0.38246041536331177, "num_input_tokens_seen": 717406, "step": 217, "train_runtime": 1640.0928, "train_tokens_per_second": 437.418 }, { "epoch": 1.7580645161290323, "grad_norm": 0.64801424741745, "learning_rate": 0.00018593793138944328, "loss": 0.37994423508644104, "num_input_tokens_seen": 720724, "step": 218, "train_runtime": 1646.6497, "train_tokens_per_second": 437.691 }, { "epoch": 1.7661290322580645, "grad_norm": 0.7920657396316528, "learning_rate": 0.0001858075791748291, "loss": 0.48340174555778503, "num_input_tokens_seen": 724156, "step": 219, "train_runtime": 1653.296, "train_tokens_per_second": 438.007 }, { "epoch": 1.7741935483870968, "grad_norm": 0.7000712156295776, "learning_rate": 0.0001856766717067904, "loss": 0.42204493284225464, "num_input_tokens_seen": 727386, "step": 220, "train_runtime": 1659.7156, "train_tokens_per_second": 438.259 }, { "epoch": 1.782258064516129, "grad_norm": 0.6280332803726196, "learning_rate": 0.00018554520983241814, "loss": 0.4031910002231598, "num_input_tokens_seen": 730576, "step": 221, "train_runtime": 1666.2385, "train_tokens_per_second": 438.458 }, { "epoch": 1.7903225806451613, "grad_norm": 0.5951255559921265, "learning_rate": 0.00018541319440239066, "loss": 0.37358546257019043, "num_input_tokens_seen": 734042, "step": 222, "train_runtime": 1672.8824, "train_tokens_per_second": 438.789 }, { "epoch": 1.7983870967741935, "grad_norm": 0.6634789109230042, "learning_rate": 0.00018528062627096845, "loss": 0.4016473889350891, "num_input_tokens_seen": 737388, "step": 223, "train_runtime": 1679.4447, "train_tokens_per_second": 439.067 }, { "epoch": 1.8064516129032258, "grad_norm": 0.6808019876480103, "learning_rate": 0.0001851475062959884, "loss": 0.40531060099601746, "num_input_tokens_seen": 740714, "step": 224, "train_runtime": 1686.0689, "train_tokens_per_second": 439.314 }, { "epoch": 1.814516129032258, "grad_norm": 0.6066514849662781, "learning_rate": 0.00018501383533885837, "loss": 0.3812728524208069, "num_input_tokens_seen": 743952, "step": 225, "train_runtime": 1692.602, "train_tokens_per_second": 439.532 }, { "epoch": 1.8225806451612905, "grad_norm": 0.6962650418281555, "learning_rate": 0.00018487961426455157, "loss": 0.4498329162597656, "num_input_tokens_seen": 747324, "step": 226, "train_runtime": 1699.2428, "train_tokens_per_second": 439.798 }, { "epoch": 1.8306451612903225, "grad_norm": 0.7276473641395569, "learning_rate": 0.0001847448439416009, "loss": 0.4539956748485565, "num_input_tokens_seen": 750660, "step": 227, "train_runtime": 1705.825, "train_tokens_per_second": 440.057 }, { "epoch": 1.838709677419355, "grad_norm": 0.7871230244636536, "learning_rate": 0.00018460952524209355, "loss": 0.4766373038291931, "num_input_tokens_seen": 753890, "step": 228, "train_runtime": 1712.1642, "train_tokens_per_second": 440.314 }, { "epoch": 1.846774193548387, "grad_norm": 0.6592321991920471, "learning_rate": 0.0001844736590416651, "loss": 0.3986653983592987, "num_input_tokens_seen": 757196, "step": 229, "train_runtime": 1718.7529, "train_tokens_per_second": 440.55 }, { "epoch": 1.8548387096774195, "grad_norm": 0.6650552153587341, "learning_rate": 0.00018433724621949392, "loss": 0.39142435789108276, "num_input_tokens_seen": 760606, "step": 230, "train_runtime": 1725.4681, "train_tokens_per_second": 440.811 }, { "epoch": 1.8629032258064515, "grad_norm": 0.6583918929100037, "learning_rate": 0.00018420028765829568, "loss": 0.4003082811832428, "num_input_tokens_seen": 763910, "step": 231, "train_runtime": 1732.022, "train_tokens_per_second": 441.051 }, { "epoch": 1.870967741935484, "grad_norm": 0.7142671942710876, "learning_rate": 0.00018406278424431736, "loss": 0.4203531742095947, "num_input_tokens_seen": 767166, "step": 232, "train_runtime": 1738.4018, "train_tokens_per_second": 441.305 }, { "epoch": 1.879032258064516, "grad_norm": 0.6280651688575745, "learning_rate": 0.00018392473686733163, "loss": 0.40257829427719116, "num_input_tokens_seen": 770592, "step": 233, "train_runtime": 1745.0525, "train_tokens_per_second": 441.587 }, { "epoch": 1.8870967741935485, "grad_norm": 0.8258600234985352, "learning_rate": 0.00018378614642063115, "loss": 0.4191328287124634, "num_input_tokens_seen": 773594, "step": 234, "train_runtime": 1751.2771, "train_tokens_per_second": 441.731 }, { "epoch": 1.8951612903225805, "grad_norm": 0.7963438034057617, "learning_rate": 0.00018364701380102266, "loss": 0.42885884642601013, "num_input_tokens_seen": 776592, "step": 235, "train_runtime": 1757.3069, "train_tokens_per_second": 441.922 }, { "epoch": 1.903225806451613, "grad_norm": 0.7623209357261658, "learning_rate": 0.0001835073399088214, "loss": 0.4639081358909607, "num_input_tokens_seen": 779942, "step": 236, "train_runtime": 1763.9906, "train_tokens_per_second": 442.146 }, { "epoch": 1.911290322580645, "grad_norm": 0.6425145864486694, "learning_rate": 0.00018336712564784503, "loss": 0.3997993469238281, "num_input_tokens_seen": 783252, "step": 237, "train_runtime": 1770.5413, "train_tokens_per_second": 442.38 }, { "epoch": 1.9193548387096775, "grad_norm": 0.7987529039382935, "learning_rate": 0.00018322637192540785, "loss": 0.4364495277404785, "num_input_tokens_seen": 786556, "step": 238, "train_runtime": 1777.0794, "train_tokens_per_second": 442.612 }, { "epoch": 1.9274193548387095, "grad_norm": 0.7024737000465393, "learning_rate": 0.00018308507965231508, "loss": 0.4188304543495178, "num_input_tokens_seen": 789882, "step": 239, "train_runtime": 1783.6117, "train_tokens_per_second": 442.855 }, { "epoch": 1.935483870967742, "grad_norm": 0.6291638612747192, "learning_rate": 0.00018294324974285677, "loss": 0.37665027379989624, "num_input_tokens_seen": 793254, "step": 240, "train_runtime": 1790.3001, "train_tokens_per_second": 443.084 }, { "epoch": 1.935483870967742, "eval_loss": 2.1662468910217285, "eval_runtime": 16.9567, "eval_samples_per_second": 3.067, "eval_steps_per_second": 1.533, "num_input_tokens_seen": 793254, "step": 240 } ], "logging_steps": 1, "max_steps": 1240, "num_input_tokens_seen": 793254, "num_train_epochs": 10, "save_steps": 30, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1549679756665984e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }