Jellyfish042 committed on
Commit
c807fbd
·
1 Parent(s): 14e0ea5

Long Context Page

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitignore +4 -1
  2. __pycache__/data_manager.cpython-311.pyc +0 -0
  3. __pycache__/longctx_utils.cpython-311.pyc +0 -0
  4. __pycache__/title.cpython-311.pyc +0 -0
  5. about.md +2 -0
  6. app.py +168 -3
  7. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-ao3_english-2025-12-23_03-28-52.json +27 -0
  8. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-arxiv_cs-2025-12-23_03-29-16.json +27 -0
  9. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-arxiv_math-2025-12-23_03-29-55.json +27 -0
  10. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-arxiv_physics-2025-12-23_03-30-25.json +27 -0
  11. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-bbc_news-2025-12-23_03-30-34.json +27 -0
  12. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-github_cpp-2025-12-23_03-30-59.json +27 -0
  13. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-github_javascript-2025-12-23_03-31-21.json +27 -0
  14. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-github_markdown-2025-12-23_03-31-40.json +27 -0
  15. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-github_python-2025-12-23_03-32-04.json +27 -0
  16. data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-wikipedia_english-2025-12-23_03-32-14.json +27 -0
  17. data/2025-12/{2025-12-21_12-21-34.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-ao3_english-2025-12-22_17-27-04.json} +6 -3
  18. data/2025-12/{2025-12-21_12-22-03.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-arxiv_cs-2025-12-22_17-27-33.json} +6 -3
  19. data/2025-12/{2025-12-21_12-22-37.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-arxiv_math-2025-12-22_17-28-09.json} +6 -3
  20. data/2025-12/{2025-12-21_12-23-07.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-arxiv_physics-2025-12-22_17-28-38.json} +6 -3
  21. data/2025-12/{2025-12-21_12-23-34.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-bbc_news-2025-12-22_17-29-05.json} +6 -3
  22. data/2025-12/{2025-12-21_12-24-04.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-github_cpp-2025-12-22_17-29-35.json} +6 -3
  23. data/2025-12/{2025-12-21_12-24-34.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-github_javascript-2025-12-22_17-30-05.json} +6 -3
  24. data/2025-12/{2025-12-21_12-25-02.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-github_markdown-2025-12-22_17-30-33.json} +6 -3
  25. data/2025-12/{2025-12-21_12-25-30.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-github_python-2025-12-22_17-31-02.json} +6 -3
  26. data/2025-12/{2025-12-21_12-25-56.json → Falcon-H1-1.5B-Base-UncheatableEval-2025-12-wikipedia_english-2025-12-22_17-31-27.json} +6 -3
  27. data/2025-12/{2025-12-21_12-03-11.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-ao3_english-2025-12-22_17-15-28.json} +4 -1
  28. data/2025-12/{2025-12-21_12-04-22.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-arxiv_cs-2025-12-22_17-16-37.json} +4 -1
  29. data/2025-12/{2025-12-21_12-05-45.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-arxiv_math-2025-12-22_17-17-57.json} +4 -1
  30. data/2025-12/{2025-12-21_12-06-56.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-arxiv_physics-2025-12-22_17-19-07.json} +4 -1
  31. data/2025-12/{2025-12-21_12-08-06.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-bbc_news-2025-12-22_17-20-15.json} +4 -1
  32. data/2025-12/{2025-12-21_12-09-22.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-github_cpp-2025-12-22_17-21-28.json} +4 -1
  33. data/2025-12/{2025-12-21_12-10-36.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-github_javascript-2025-12-22_17-22-39.json} +4 -1
  34. data/2025-12/{2025-12-21_12-11-47.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-github_markdown-2025-12-22_17-23-48.json} +4 -1
  35. data/2025-12/{2025-12-21_12-12-57.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-github_python-2025-12-22_17-24-57.json} +4 -1
  36. data/2025-12/{2025-12-21_12-14-03.json → Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-wikipedia_english-2025-12-22_17-26-03.json} +4 -1
  37. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-ao3_english-2025-12-22_18-25-37.json +29 -0
  38. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-arxiv_cs-2025-12-22_18-26-15.json +29 -0
  39. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-arxiv_math-2025-12-22_18-27-06.json +29 -0
  40. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-arxiv_physics-2025-12-22_18-27-47.json +29 -0
  41. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-bbc_news-2025-12-22_18-28-23.json +29 -0
  42. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-github_cpp-2025-12-22_18-29-05.json +29 -0
  43. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-github_javascript-2025-12-22_18-29-46.json +29 -0
  44. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-github_markdown-2025-12-22_18-30-25.json +29 -0
  45. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-github_python-2025-12-22_18-31-04.json +29 -0
  46. data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-wikipedia_english-2025-12-22_18-31-38.json +29 -0
  47. data/2025-12/Falcon-H1-7B-Base-UncheatableEval-2025-12-ao3_english-2025-12-22_21-02-16.json +28 -0
  48. data/2025-12/Falcon-H1-7B-Base-UncheatableEval-2025-12-arxiv_cs-2025-12-22_21-03-07.json +28 -0
  49. data/2025-12/Falcon-H1-7B-Base-UncheatableEval-2025-12-arxiv_math-2025-12-22_21-04-23.json +28 -0
  50. data/2025-12/Falcon-H1-7B-Base-UncheatableEval-2025-12-arxiv_physics-2025-12-22_21-05-22.json +28 -0
.gitignore CHANGED
@@ -1,2 +1,5 @@
1
  /.env
2
- /.idea
 
 
 
 
1
  /.env
2
+ /.idea
3
+ /.logs
4
+ /.__pycache__
5
+ /.temp
__pycache__/data_manager.cpython-311.pyc CHANGED
Binary files a/__pycache__/data_manager.cpython-311.pyc and b/__pycache__/data_manager.cpython-311.pyc differ
 
__pycache__/longctx_utils.cpython-311.pyc ADDED
Binary file (10.1 kB). View file
 
__pycache__/title.cpython-311.pyc CHANGED
Binary files a/__pycache__/title.cpython-311.pyc and b/__pycache__/title.cpython-311.pyc differ
 
about.md CHANGED
@@ -2,6 +2,8 @@
2
 
3
  GitHub page: [https://github.com/Jellyfish042/uncheatable_eval](https://github.com/Jellyfish042/uncheatable_eval)
4
 
 
 
5
  ## Introduction
6
  Traditional LLM benchmarks are easily compromised by unintentional or intentional data leakage, making many benchmarks unreliable and unable to truly reflect the capabilities of LLMs.
7
 
 
2
 
3
  GitHub page: [https://github.com/Jellyfish042/uncheatable_eval](https://github.com/Jellyfish042/uncheatable_eval)
4
 
5
+ Dataset page: [https://huggingface.co/collections/Jellyfish042/uncheatableeval](https://huggingface.co/collections/Jellyfish042/uncheatableeval)
6
+
7
  ## Introduction
8
  Traditional LLM benchmarks are easily compromised by unintentional or intentional data leakage, making many benchmarks unreliable and unable to truly reflect the capabilities of LLMs.
9
 
app.py CHANGED
@@ -1,4 +1,3 @@
1
- from operator import is_
2
  import pandas as pd
3
  import gradio as gr
4
  import os
@@ -13,7 +12,11 @@ from huggingface_hub.utils import GatedRepoError
13
  from gradio_rangeslider import RangeSlider
14
  import datetime
15
  from title import css, TITLE_HTML, SUBTITLE_HTML
16
- from data_manager import DataManager
 
 
 
 
17
 
18
  load_dotenv()
19
  webhook_url = os.environ.get("WEBHOOK_URL")
@@ -50,6 +53,11 @@ def read_about_md():
50
  return f.read()
51
 
52
 
 
 
 
 
 
53
  def update_table(
54
  data_manager: DataManager,
55
  period: str,
@@ -282,6 +290,30 @@ if __name__ == "__main__":
282
  time_list = data_manager.get_available_periods()
283
  last_period = time_list[-1]
284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  initial_fig = create_scaling_plot(data_manager, last_period) if last_period else go.Figure()
286
  initial_metric = metric_list[0]
287
  initial_columns = data_manager.get_available_columns(last_period)
@@ -337,7 +369,140 @@ if __name__ == "__main__":
337
  midpoint_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
338
 
339
  with gr.Tab("📚 Long Context"):
340
- gr.Markdown("## Coming soon...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
  with gr.Tab("📈 Scaling Law"):
343
  period_selector_2 = gr.Dropdown(label="Period", choices=time_list, value=last_period)
 
 
1
  import pandas as pd
2
  import gradio as gr
3
  import os
 
12
  from gradio_rangeslider import RangeSlider
13
  import datetime
14
  from title import css, TITLE_HTML, SUBTITLE_HTML
15
+ from data_manager import DataManager, LongContextDataManager
16
+ import matplotlib.pyplot as plt
17
+ from matplotlib.ticker import ScalarFormatter
18
+ from collections import defaultdict
19
+ from longctx_utils import *
20
 
21
  load_dotenv()
22
  webhook_url = os.environ.get("WEBHOOK_URL")
 
53
  return f.read()
54
 
55
 
56
+ def read_longctx_about_md():
57
+ with open("longctx_about.md", "r", encoding="utf-8") as f:
58
+ return f.read()
59
+
60
+
61
  def update_table(
62
  data_manager: DataManager,
63
  period: str,
 
290
  time_list = data_manager.get_available_periods()
291
  last_period = time_list[-1]
292
 
293
+ # Long Context Data
294
+ lc_dm = LongContextDataManager("longctx_data")
295
+ lc_periods = lc_dm.get_available_periods()
296
+ default_lc_period = lc_periods[-1]
297
+ MODE_ABS_AVG = "Absolute (Averaged by Model)"
298
+ MODE_ABS_SINGLE = "Absolute (Single Dataset)"
299
+ MODE_REL_AVG = "Relative (Averaged by Model)"
300
+ MODE_REL_SINGLE = "Relative (Single Dataset)"
301
+ lc_modes = [MODE_ABS_AVG, MODE_ABS_SINGLE, MODE_REL_AVG, MODE_REL_SINGLE]
302
+ default_lc_mode = MODE_ABS_AVG
303
+ init_lc_choices = lc_dm.get_model_choices(default_lc_period)
304
+ print(init_lc_choices)
305
+
306
+ def create_initial_lc_plot():
307
+ if not init_lc_choices:
308
+ return None
309
+ default_model = init_lc_choices[0][1]
310
+ data_map = {}
311
+ paths = lc_dm.get_paths_for_model(default_lc_period, default_model)
312
+ data_map[default_model] = paths
313
+ return draw_long_context_plot(default_lc_mode, data_map, None, 0.2, 32, 32, [None, None])
314
+
315
+ initial_lc_plot = create_initial_lc_plot()
316
+
317
  initial_fig = create_scaling_plot(data_manager, last_period) if last_period else go.Figure()
318
  initial_metric = metric_list[0]
319
  initial_columns = data_manager.get_available_columns(last_period)
 
369
  midpoint_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
370
 
371
  with gr.Tab("📚 Long Context"):
372
+ gr.Markdown(read_longctx_about_md())
373
+ with gr.Row():
374
+ with gr.Column(scale=1):
375
+ lc_period_dropdown = gr.Dropdown(label="Period", choices=lc_periods, value=default_lc_period)
376
+
377
+ lc_mode_radio = gr.Radio(label="Visualization Mode", choices=lc_modes, value=default_lc_mode)
378
+
379
+ gr.Markdown("### Model / Dataset Selection")
380
+
381
+ default_selected_models = [init_lc_choices[0][1]] if init_lc_choices else []
382
+ lc_select_abs = gr.Dropdown(
383
+ label="Select Models", choices=init_lc_choices, value=default_selected_models, multiselect=True, visible=True
384
+ )
385
+
386
+ lc_select_base = gr.Dropdown(
387
+ label="Baseline Model",
388
+ choices=init_lc_choices,
389
+ value=None,
390
+ multiselect=False,
391
+ visible=False,
392
+ )
393
+
394
+ lc_select_comp = gr.Dropdown(label="Comparison Models", choices=init_lc_choices, value=[], multiselect=True, visible=False)
395
+
396
+ with gr.Accordion("Advanced Settings", open=True):
397
+ lc_smooth = gr.Slider(1, 125, 32, step=1, label="Smooth Window")
398
+ lc_cutoff = gr.Slider(0.1, 1.0, 0.2, step=0.05, label="Cutoff Ratio")
399
+ lc_offset = gr.Number(32, label="Start Offset (Bytes)")
400
+ with gr.Row():
401
+ lc_ymin = gr.Textbox(label="Y Min", placeholder="Auto", value="")
402
+ lc_ymax = gr.Textbox(label="Y Max", placeholder="Auto", value="")
403
+
404
+ lc_btn_plot = gr.Button("Visualize", variant="primary")
405
+
406
+ with gr.Column(scale=3):
407
+ lc_plot_output = gr.Plot(label="Visualization Result", value=initial_lc_plot)
408
+
409
+ def update_lc_inputs(period, mode):
410
+ if not period:
411
+ return tuple([gr.update()] * 3)
412
+
413
+ is_model_agg = "Averaged by Model" in mode
414
+ is_relative = "Relative" in mode
415
+
416
+ if is_model_agg:
417
+ choices = lc_dm.get_model_choices(period)
418
+ label_suffix = "Models"
419
+ else:
420
+ choices = lc_dm.get_file_choices(period)
421
+ label_suffix = "Datasets"
422
+ if not is_relative:
423
+ return (
424
+ gr.update(visible=True, choices=choices, label=f"Select {label_suffix}", value=[]),
425
+ gr.update(visible=False, choices=choices, value=None),
426
+ gr.update(visible=False, choices=choices, value=[]),
427
+ )
428
+ else:
429
+ return (
430
+ gr.update(visible=False, choices=choices, value=[]),
431
+ gr.update(visible=True, choices=choices, label=f"Baseline", value=None),
432
+ gr.update(visible=True, choices=choices, label=f"Comparison", value=[]),
433
+ )
434
+
435
+ lc_period_dropdown.change(
436
+ fn=update_lc_inputs, inputs=[lc_period_dropdown, lc_mode_radio], outputs=[lc_select_abs, lc_select_base, lc_select_comp]
437
+ )
438
+ lc_mode_radio.change(
439
+ fn=update_lc_inputs, inputs=[lc_period_dropdown, lc_mode_radio], outputs=[lc_select_abs, lc_select_base, lc_select_comp]
440
+ )
441
+
442
+ def run_lc_plot(mode, period, sel_abs, sel_base, sel_comp, smooth, cutoff, offset, ymin, ymax):
443
+ data_map = {}
444
+ baseline_key = None
445
+
446
+ is_model_agg = "Averaged by Model" in mode
447
+ is_relative = "Relative" in mode
448
+
449
+ if not is_relative:
450
+ selection = sel_abs
451
+ else:
452
+ if not sel_base:
453
+ return None
454
+ selection = [sel_base] + sel_comp
455
+ if is_model_agg:
456
+ baseline_key = sel_base
457
+ else:
458
+ baseline_key = os.path.basename(sel_base)
459
+
460
+ if not selection:
461
+ return None
462
+
463
+ for item in selection:
464
+ if is_model_agg:
465
+ paths = lc_dm.get_paths_for_model(period, item)
466
+ if paths:
467
+ data_map[item] = paths
468
+ else:
469
+ fname = os.path.basename(item)
470
+ data_map[fname] = [item]
471
+
472
+ def _to_float_or_none(val):
473
+ if val is None:
474
+ return None
475
+ s = str(val).strip()
476
+ if not s:
477
+ return None
478
+ try:
479
+ return float(s)
480
+ except ValueError:
481
+ return None
482
+
483
+ ymin = _to_float_or_none(ymin)
484
+ ymax = _to_float_or_none(ymax)
485
+
486
+ y_range = [ymin, ymax]
487
+
488
+ return draw_long_context_plot(mode, data_map, baseline_key, cutoff, smooth, int(offset), y_range)
489
+
490
+ lc_btn_plot.click(
491
+ fn=run_lc_plot,
492
+ inputs=[
493
+ lc_mode_radio,
494
+ lc_period_dropdown,
495
+ lc_select_abs,
496
+ lc_select_base,
497
+ lc_select_comp,
498
+ lc_smooth,
499
+ lc_cutoff,
500
+ lc_offset,
501
+ lc_ymin,
502
+ lc_ymax,
503
+ ],
504
+ outputs=lc_plot_output,
505
+ )
506
 
507
  with gr.Tab("📈 Scaling Law"):
508
  period_selector_2 = gr.Dropdown(label="Period", choices=time_list, value=last_period)
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-ao3_english-2025-12-23_03-28-52.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5917.968648071289,
3
+ "avg tokens": 1989.774,
4
+ "avg character count": 7857.404,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 8012.242,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-ao3_english",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 1.0865960335893141,
24
+ "bpb": 1.0655973722097776,
25
+ "compression_rate": 13.319967152622219,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-arxiv_cs-2025-12-23_03-29-16.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5052.083911132812,
3
+ "avg tokens": 2184.846,
4
+ "avg character count": 9964.74,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 9994.128,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_cs",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 0.7314407003841773,
24
+ "bpb": 0.7292898794918602,
25
+ "compression_rate": 9.116123493648253,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-arxiv_math-2025-12-23_03-29-55.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 4687.764853004455,
3
+ "avg tokens": 2998.584,
4
+ "avg character count": 9913.284,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 9918.674,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_math",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 0.6822174272706308,
24
+ "bpb": 0.6818466970769588,
25
+ "compression_rate": 8.523083713461984,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-arxiv_physics-2025-12-23_03-30-25.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 5154.191329101563,
3
+ "avg tokens": 2540.232,
4
+ "avg character count": 9946.974,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 9952.8,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_physics",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 0.747556620766046,
24
+ "bpb": 0.74711902884492,
25
+ "compression_rate": 9.3389878605615,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-bbc_news-2025-12-23_03-30-34.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 2131.0125563964843,
3
+ "avg tokens": 779.508,
4
+ "avg character count": 3394.84,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 3396.996,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-bbc_news",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 0.9056100573768778,
24
+ "bpb": 0.9050352862309289,
25
+ "compression_rate": 11.312941077886611,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-github_cpp-2025-12-23_03-30-59.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1737.0164586486817,
3
+ "avg tokens": 1978.696,
4
+ "avg character count": 5773.33,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 5853.154,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-github_cpp",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 0.4340623229288751,
24
+ "bpb": 0.4281426784319978,
25
+ "compression_rate": 5.351783480399972,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-github_javascript-2025-12-23_03-31-21.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1777.4683681259155,
3
+ "avg tokens": 1832.826,
4
+ "avg character count": 5774.754,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 5870.628,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-github_javascript",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 0.44406130547418243,
24
+ "bpb": 0.4368092817382156,
25
+ "compression_rate": 5.4601160217276945,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-github_markdown-2025-12-23_03-31-40.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 2788.8940350341795,
3
+ "avg tokens": 1649.738,
4
+ "avg character count": 5024.17,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 5522.098,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-github_markdown",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 0.8008334896925505,
24
+ "bpb": 0.7286222725327622,
25
+ "compression_rate": 9.107778406659527,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-github_python-2025-12-23_03-32-04.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 2042.8134483337403,
3
+ "avg tokens": 1993.354,
4
+ "avg character count": 6339.622,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 6497.474,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-github_python",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 0.46487895198994034,
24
+ "bpb": 0.4535850134024961,
25
+ "compression_rate": 5.669812667531201,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/ERNIE-4.5-0.3B-Base-PT-UncheatableEval-2025-12-wikipedia_english-2025-12-23_03-32-14.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1901.4363048095704,
3
+ "avg tokens": 783.536,
4
+ "avg character count": 3043.39,
5
+ "parameters count": 0.360748032,
6
+ "avg bytes": 3062.292,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "baidu/ERNIE-4.5-0.3B-Base-PT",
9
+ "tokenizer_name": "baidu/ERNIE-4.5-0.3B-Base-PT",
10
+ "data_path": "UncheatableEval-2025-12-wikipedia_english",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true
16
+ },
17
+ "tokenizer_args": {
18
+ "trust_remote_code": true
19
+ },
20
+ "requirements": [],
21
+ "batch_size": 1,
22
+ "enable_chunking": true,
23
+ "bpc": 0.901360892792249,
24
+ "bpb": 0.8957972419073696,
25
+ "compression_rate": 11.197465523842121,
26
+ "track_byte_wise_data": false
27
+ }
data/2025-12/{2025-12-21_12-21-34.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-ao3_english-2025-12-22_17-27-04.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 5113.168,
3
  "avg tokens": 2079.228,
4
  "avg character count": 7857.404,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-ao3_english",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 11.508548601053453,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 5113.096,
3
  "avg tokens": 2079.228,
4
  "avg character count": 7857.404,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-ao3_english",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.9388136645117389,
26
+ "bpb": 0.9206709236677068,
27
+ "compression_rate": 11.508386545846335,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-22-03.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-arxiv_cs-2025-12-22_17-27-33.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 4357.728,
3
  "avg tokens": 2317.894,
4
  "avg character count": 9964.74,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_cs",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 7.863207994663192,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 4357.824,
3
  "avg tokens": 2317.894,
4
  "avg character count": 9964.74,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_cs",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6309257515867855,
26
+ "bpb": 0.6290704975828711,
27
+ "compression_rate": 7.863381219785889,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-22-37.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-arxiv_math-2025-12-22_17-28-09.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 3843.735,
3
  "avg tokens": 3293.932,
4
  "avg character count": 9913.284,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_math",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 6.988506506756019,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 3843.726,
3
  "avg tokens": 3293.932,
4
  "avg character count": 9913.284,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_math",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.5593831911540083,
26
+ "bpb": 0.5590792114687883,
27
+ "compression_rate": 6.988490143359853,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-23-07.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-arxiv_physics-2025-12-22_17-28-38.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 4405.662,
3
  "avg tokens": 2782.132,
4
  "avg character count": 9946.974,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_physics",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 7.982711798731203,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 4405.5345,
3
  "avg tokens": 2782.132,
4
  "avg character count": 9946.974,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_physics",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6389724931034543,
26
+ "bpb": 0.6385984623035968,
27
+ "compression_rate": 7.982480778794961,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-23-34.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-bbc_news-2025-12-22_17-29-05.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 1803.196,
3
  "avg tokens": 822.958,
4
  "avg character count": 3394.84,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-bbc_news",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 9.572655983959118,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 1803.28,
3
  "avg tokens": 822.958,
4
  "avg character count": 3394.84,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-bbc_news",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7663345292662539,
26
+ "bpb": 0.7658481532902157,
27
+ "compression_rate": 9.573101916127696,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-24-04.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-github_cpp-2025-12-22_17-29-35.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 1460.103,
3
  "avg tokens": 2142.084,
4
  "avg character count": 5773.33,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-github_cpp",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 4.498607411677315,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 1460.314,
3
  "avg tokens": 2142.084,
4
  "avg character count": 5773.33,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_cpp",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.3649172602190981,
26
+ "bpb": 0.35994060056180405,
27
+ "compression_rate": 4.49925750702255,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-24-34.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-github_javascript-2025-12-22_17-30-05.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 1512.8945,
3
  "avg tokens": 2023.828,
4
  "avg character count": 5774.754,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-github_javascript",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 4.6473848124472115,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 1512.8745,
3
  "avg tokens": 2023.828,
4
  "avg character count": 5774.754,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_javascript",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.37795835781703774,
26
+ "bpb": 0.3717858700359433,
27
+ "compression_rate": 4.647323375449291,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-25-02.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-github_markdown-2025-12-22_17-30-33.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 2469.5905,
3
  "avg tokens": 1759.972,
4
  "avg character count": 5024.17,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-github_markdown",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 8.065018873661096,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 2469.5585,
3
  "avg tokens": 1759.972,
4
  "avg character count": 5024.17,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_markdown",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7091359968184172,
26
+ "bpb": 0.6451931496208845,
27
+ "compression_rate": 8.064914370261056,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-25-30.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-github_python-2025-12-22_17-31-02.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 1762.199,
3
  "avg tokens": 2101.646,
4
  "avg character count": 6339.622,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-github_python",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 4.890969472058469,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 1762.227,
3
  "avg tokens": 2101.646,
4
  "avg character count": 6339.622,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_python",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.40102645770057516,
26
+ "bpb": 0.39128377486706917,
27
+ "compression_rate": 4.891047185838365,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-25-56.json β†’ Falcon-H1-1.5B-Base-UncheatableEval-2025-12-wikipedia_english-2025-12-22_17-31-27.json} RENAMED
@@ -1,5 +1,5 @@
1
  {
2
- "neg_log_prob_sum": 1605.912,
3
  "avg tokens": 843.83,
4
  "avg character count": 3043.39,
5
  "parameters count": 1.554859392,
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-wikipedia_english",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
- "compression_rate": 9.457137327629423,
 
 
 
25
  "track_byte_wise_data": false
26
  }
 
1
  {
2
+ "neg_log_prob_sum": 1605.888,
3
  "avg tokens": 843.83,
4
  "avg character count": 3043.39,
5
  "parameters count": 1.554859392,
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Base",
10
+ "data_path": "UncheatableEval-2025-12-wikipedia_english",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7612585484683514,
26
+ "bpb": 0.7565596794241358,
27
+ "compression_rate": 9.456995992801698,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-03-11.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-ao3_english-2025-12-22_17-15-28.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-ao3_english",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 11.348654130029065,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-ao3_english",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.9257832562927127,
26
+ "bpb": 0.9078923304023252,
27
  "compression_rate": 11.348654130029065,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-04-22.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-arxiv_cs-2025-12-22_17-16-37.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_cs",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 7.7344042893665215,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_cs",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6205771668637862,
26
+ "bpb": 0.6187523431493217,
27
  "compression_rate": 7.7344042893665215,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-05-45.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-arxiv_math-2025-12-22_17-17-57.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_math",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 6.823898923027586,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_math",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.5462087342718052,
26
+ "bpb": 0.5459119138422068,
27
  "compression_rate": 6.823898923027586,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-06-56.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-arxiv_physics-2025-12-22_17-19-07.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_physics",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 7.839570940275909,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_physics",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6275330097726449,
26
+ "bpb": 0.6271656752220727,
27
  "compression_rate": 7.839570940275909,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-08-06.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-bbc_news-2025-12-22_17-20-15.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-bbc_news",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 9.445374202127635,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-bbc_news",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7561098227458323,
26
+ "bpb": 0.7556299361702108,
27
  "compression_rate": 9.445374202127635,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-09-22.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-github_cpp-2025-12-22_17-21-28.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-github_cpp",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 4.351616540690811,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_cpp",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.3529426762247866,
26
+ "bpb": 0.3481293232552649,
27
  "compression_rate": 4.351616540690811,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-10-36.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-github_javascript-2025-12-22_17-22-39.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-github_javascript",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 4.493982004379503,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_javascript",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.36548738299718303,
26
+ "bpb": 0.35951856035036023,
27
  "compression_rate": 4.493982004379503,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-11-47.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-github_markdown-2025-12-22_17-23-48.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-github_markdown",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 7.847379113017852,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_markdown",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6900084432690399,
26
+ "bpb": 0.6277903290414282,
27
  "compression_rate": 7.847379113017852,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-12-57.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-github_python-2025-12-22_17-24-57.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-github_python",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 4.72963844048806,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_python",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.387792241196358,
26
+ "bpb": 0.3783710752390448,
27
  "compression_rate": 4.72963844048806,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/{2025-12-21_12-14-03.json β†’ Falcon-H1-1.5B-Deep-Base-UncheatableEval-2025-12-wikipedia_english-2025-12-22_17-26-03.json} RENAMED
@@ -7,7 +7,7 @@
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
- "data_path": "Jellyfish042/UncheatableEval-2025-12-wikipedia_english",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
@@ -21,6 +21,9 @@
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
 
 
 
24
  "compression_rate": 9.298512539311618,
25
  "track_byte_wise_data": false
26
  }
 
7
  "sample_count": 500,
8
  "model_name_or_path": "tiiuae/Falcon-H1-1.5B-Deep-Base",
9
  "tokenizer_name": "tiiuae/Falcon-H1-1.5B-Deep-Base",
10
+ "data_path": "UncheatableEval-2025-12-wikipedia_english",
11
  "chunk_size": 4000,
12
  "ensure_bos_token": true,
13
  "model_args": {
 
21
  },
22
  "requirements": [],
23
  "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7485011269941388,
26
+ "bpb": 0.7438810031449294,
27
  "compression_rate": 9.298512539311618,
28
  "track_byte_wise_data": false
29
  }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-ao3_english-2025-12-22_18-25-37.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 4851.168,
3
+ "avg tokens": 2079.228,
4
+ "avg character count": 7857.404,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 8012.242,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-ao3_english",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.8907211613554846,
26
+ "bpb": 0.8735078166784316,
27
+ "compression_rate": 10.918847708480396,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-arxiv_cs-2025-12-22_18-26-15.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 4041.946,
3
+ "avg tokens": 2317.894,
4
+ "avg character count": 9964.74,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 9994.128,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_cs",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.5851929352638385,
26
+ "bpb": 0.5834721598263481,
27
+ "compression_rate": 7.293401997829352,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-arxiv_math-2025-12-22_18-27-06.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 3533.473,
3
+ "avg tokens": 3293.932,
4
+ "avg character count": 9913.284,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 9918.674,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_math",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.5142316082354795,
26
+ "bpb": 0.5139521647969323,
27
+ "compression_rate": 6.424402059961654,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-arxiv_physics-2025-12-22_18-27-47.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 4002.526,
3
+ "avg tokens": 2782.132,
4
+ "avg character count": 9946.974,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 9952.8,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_physics",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.580520710241038,
26
+ "bpb": 0.580180894947064,
27
+ "compression_rate": 7.2522611868383,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-bbc_news-2025-12-22_18-28-23.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1688.372,
3
+ "avg tokens": 822.958,
4
+ "avg character count": 3394.84,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 3396.996,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-bbc_news",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7175024188402932,
26
+ "bpb": 0.7170470355501687,
27
+ "compression_rate": 8.963087944377108,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-github_cpp-2025-12-22_18-29-05.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1265.126,
3
+ "avg tokens": 2142.084,
4
+ "avg character count": 5773.33,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 5853.154,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_cpp",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.3161418117966045,
26
+ "bpb": 0.3118303407529839,
27
+ "compression_rate": 3.8978792594122984,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-github_javascript-2025-12-22_18-29-46.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1291.97525,
3
+ "avg tokens": 2023.828,
4
+ "avg character count": 5774.754,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 5870.628,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_javascript",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.32277154769298894,
26
+ "bpb": 0.3175003229852545,
27
+ "compression_rate": 3.9687540373156813,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-github_markdown-2025-12-22_18-30-25.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 2191.3145,
3
+ "avg tokens": 1759.972,
4
+ "avg character count": 5024.17,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 5522.098,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_markdown",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.6292379760593447,
26
+ "bpb": 0.572499539518871,
27
+ "compression_rate": 7.1562442439858875,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-github_python-2025-12-22_18-31-04.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1559.233,
3
+ "avg tokens": 2101.646,
4
+ "avg character count": 6339.622,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 6497.474,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-github_python",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.35483152097876197,
26
+ "bpb": 0.3462111147640485,
27
+ "compression_rate": 4.327638934550606,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-3B-Base-UncheatableEval-2025-12-wikipedia_english-2025-12-22_18-31-38.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 1486.464,
3
+ "avg tokens": 843.83,
4
+ "avg character count": 3043.39,
5
+ "parameters count": 3.149387264,
6
+ "avg bytes": 3062.292,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-3B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-3B-Base",
10
+ "data_path": "UncheatableEval-2025-12-wikipedia_english",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "attn_implementation": "flash_attention_2",
17
+ "torch_dtype": "torch.bfloat16"
18
+ },
19
+ "tokenizer_args": {
20
+ "trust_remote_code": true
21
+ },
22
+ "requirements": [],
23
+ "batch_size": 1,
24
+ "enable_chunking": true,
25
+ "bpc": 0.7046465425922974,
26
+ "bpb": 0.7002971112029721,
27
+ "compression_rate": 8.753713890037151,
28
+ "track_byte_wise_data": false
29
+ }
data/2025-12/Falcon-H1-7B-Base-UncheatableEval-2025-12-ao3_english-2025-12-22_21-02-16.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 4548.98,
3
+ "avg tokens": 1989.214,
4
+ "avg character count": 7857.404,
5
+ "parameters count": 7.585648736,
6
+ "avg bytes": 8012.242,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-7B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-7B-Base",
10
+ "data_path": "UncheatableEval-2025-12-ao3_english",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.8352365344970266,
25
+ "bpb": 0.8190954400906857,
26
+ "compression_rate": 10.238693001133571,
27
+ "track_byte_wise_data": false
28
+ }
data/2025-12/Falcon-H1-7B-Base-UncheatableEval-2025-12-arxiv_cs-2025-12-22_21-03-07.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 3815.616,
3
+ "avg tokens": 2203.3,
4
+ "avg character count": 9964.74,
5
+ "parameters count": 7.585648736,
6
+ "avg bytes": 9994.128,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-7B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-7B-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_cs",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.5524248782343125,
25
+ "bpb": 0.5508004581426796,
26
+ "compression_rate": 6.885005726783495,
27
+ "track_byte_wise_data": false
28
+ }
data/2025-12/Falcon-H1-7B-Base-UncheatableEval-2025-12-arxiv_math-2025-12-22_21-04-23.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 3289.001625,
3
+ "avg tokens": 3207.47,
4
+ "avg character count": 9913.284,
5
+ "parameters count": 7.585648736,
6
+ "avg bytes": 9918.674,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-7B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-7B-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_math",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.4786533235467926,
25
+ "bpb": 0.47839321403881624,
26
+ "compression_rate": 5.979915175485203,
27
+ "track_byte_wise_data": false
28
+ }
data/2025-12/Falcon-H1-7B-Base-UncheatableEval-2025-12-arxiv_physics-2025-12-22_21-05-22.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "neg_log_prob_sum": 3717.061,
3
+ "avg tokens": 2669.262,
4
+ "avg character count": 9946.974,
5
+ "parameters count": 7.585648736,
6
+ "avg bytes": 9952.8,
7
+ "sample_count": 500,
8
+ "model_name_or_path": "tiiuae/Falcon-H1-7B-Base",
9
+ "tokenizer_name": "tiiuae/Falcon-H1-7B-Base",
10
+ "data_path": "UncheatableEval-2025-12-arxiv_physics",
11
+ "chunk_size": 4000,
12
+ "ensure_bos_token": true,
13
+ "model_args": {
14
+ "device_map": "auto",
15
+ "trust_remote_code": true,
16
+ "torch_dtype": "torch.bfloat16"
17
+ },
18
+ "tokenizer_args": {
19
+ "trust_remote_code": true
20
+ },
21
+ "requirements": [],
22
+ "batch_size": 1,
23
+ "enable_chunking": true,
24
+ "bpc": 0.5391172703760733,
25
+ "bpb": 0.5388016911202648,
26
+ "compression_rate": 6.73502113900331,
27
+ "track_byte_wise_data": false
28
+ }