From bae00ca5df188af2d7532b961ebb1a3071b46e54 Mon Sep 17 00:00:00 2001 From: Deyan Ginev <d.ginev@jacobs-university.de> Date: Mon, 22 Jul 2019 14:55:11 -0400 Subject: [PATCH] add frequency breakdown of contents --- resources/arxmliv-statements-082018.md | 56 ++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/resources/arxmliv-statements-082018.md b/resources/arxmliv-statements-082018.md index 07ace63..cc56301 100644 --- a/resources/arxmliv-statements-082018.md +++ b/resources/arxmliv-statements-082018.md @@ -24,6 +24,7 @@ articles, the right of distribution was only given (or assumed) to arXiv itself. - each filename is a SHA-256 hash of its contents, as a guarantee for uniqueness and random order - two separate tar bundles over the same data, one with and one without lexemes for mathematical expressions - data is extracted from the separately distributed [arXMLiv 08.2018](https://sigmathling.kwarc.info/resources/arxmliv-dataset-082018/) dataset. + - see the bottom of this page for a full statement frequency breakdown | file name | MD5 | size | size unpacked | | :------------------------------------------------ | :--------------------------------- | ----: | ------------: | @@ -104,3 +105,58 @@ nomath source: `definition/35b170bae4259a5c430846116142d4e4a45097e52daf818b78ea3 ### Generated via - [llamapun 0.3.2](https://github.com/KWARC/llamapun/releases/tag/0.3.2) + +### Contents Breakdown + + | **statement class** | **frequency** | **frequency (nomath)** | + | :------------------ | ------------: | ---------------------: | + | abstract | 1,030,774 | 1,030,691 | + | acknowledgement | 162,230 | 162,220 | + | affirmation | 36 | 22 | + | answer | 40 | 39 | + | assumption | 29,577 | 26,890 | + | bound | 47 | 37 | + | case | 3,256 | 2,208 | + | claim | 89,737 | 75,778 | + | comment | 325 | 322 | + | conclusion | 284,585 | 284,536 | + | condition | 3,950 | 3,508 | + | conjecture | 44,893 | 41,780 | + | constraint | 753 | 731 | + | convention | 2,176 | 
2,160 | + | corollary | 436,768 | 402,728 | + | criterion | 236 | 219 | + | definition | 686,717 | 667,797 | + | demonstration | 23,043 | 22,842 | + | discussion | 116,650 | 116,643 | + | example | 295,152 | 289,005 | + | exercise | 404 | 404 | + | expansion | 5 | 2 | + | expectation | 13 | 13 | + | experiment | 154 | 153 | + | explanation | 16 | 16 | + | fact | 17,737 | 16,473 | + | hint | 9 | 9 | + | introduction | 688,530 | 688,187 | + | issue | 41 | 28 | + | keywords | 1,565 | 1,565 | + | lemma | 1,320,646 | 1,162,559 | + | method | 50,968 | 50,947 | + | notation | 16,611 | 16,077 | + | note | 4,462 | 4,415 | + | notice | 4 | 4 | + | observation | 18,776 | 18,013 | + | overview | 11,279 | 11,277 | + | principle | 236 | 232 | + | problem | 30,369 | 29,221 | + | proof | 2,125,750 | 2,096,644 | + | proposition | 829,068 | 763,268 | + | question | 27,240 | 26,673 | + | relatedwork | 26,300 | 26,299 | + | remark | 639,038 | 635,180 | + | result | 239,905 | 239,639 | + | rule | 775 | 712 | + | solution | 163 | 144 | + | step | 6,910 | 6,536 | + | summary | 117 | 117 | + | theorem | 1,287,653 | 1,212,044 | -- GitLab