From b182a6cef3c0bbf92cc90bf32dba2b0b6f0b5f11 Mon Sep 17 00:00:00 2001 From: jfschaefer <jfschaefer@outlook.com> Date: Sat, 14 Jul 2018 22:10:44 +0200 Subject: [PATCH] started cleaning up for paper --- README.md | 86 ++++++++++++++++++---------------- examples.gfs | 20 ++++++-- gf/MNlpEng.gf | 3 -- gf/MNlpGer.gf | 3 -- gf/MNlpLog.gf | 3 -- {gf => src}/DGrammar.gf | 0 {gf => src}/DGrammarEng.gf | 4 +- {gf => src}/DGrammarGer.gf | 9 ++-- {gf => src}/DGrammarLog.gf | 0 {gf => src}/DLexicon.gf | 0 {gf => src}/DLexiconEng.gf | 0 {gf => src}/DLexiconGer.gf | 2 +- {gf => src}/DLexiconLog.gf | 0 {gf => src}/DLexiconMObj.gf | 0 {gf => src}/DLexiconMObjEng.gf | 0 {gf => src}/DLexiconMObjLog.gf | 0 {gf => src}/FGrammar.gf | 0 {gf => src}/FGrammarEng.gf | 0 {gf => src}/FGrammarGer.gf | 0 {gf => src}/FGrammarI.gf | 0 {gf => src}/FGrammarLog.gf | 0 {gf => src}/FLexicon.gf | 0 {gf => src}/FLexiconEng.gf | 0 {gf => src}/FLexiconGer.gf | 0 {gf => src}/FLexiconLog.gf | 0 {gf => src}/MCats.gf | 0 {gf => src}/MCatsEng.gf | 0 {gf => src}/MCatsGer.gf | 0 {gf => src}/MCatsI.gf | 0 {gf => src}/MCatsLog.gf | 0 gf/MNlp.gf => src/Mgf.gf | 2 +- src/MgfEng.gf | 3 ++ src/MgfGer.gf | 3 ++ src/MgfLog.gf | 3 ++ {gf => src}/UtilsEng.gf | 0 {gf => src}/UtilsGer.gf | 0 36 files changed, 79 insertions(+), 62 deletions(-) delete mode 100644 gf/MNlpEng.gf delete mode 100644 gf/MNlpGer.gf delete mode 100644 gf/MNlpLog.gf rename {gf => src}/DGrammar.gf (100%) rename {gf => src}/DGrammarEng.gf (95%) rename {gf => src}/DGrammarGer.gf (86%) rename {gf => src}/DGrammarLog.gf (100%) rename {gf => src}/DLexicon.gf (100%) rename {gf => src}/DLexiconEng.gf (100%) rename {gf => src}/DLexiconGer.gf (92%) rename {gf => src}/DLexiconLog.gf (100%) rename {gf => src}/DLexiconMObj.gf (100%) rename {gf => src}/DLexiconMObjEng.gf (100%) rename {gf => src}/DLexiconMObjLog.gf (100%) rename {gf => src}/FGrammar.gf (100%) rename {gf => src}/FGrammarEng.gf (100%) rename {gf => src}/FGrammarGer.gf (100%) rename {gf => src}/FGrammarI.gf (100%) rename {gf => src}/FGrammarLog.gf (100%) rename {gf => src}/FLexicon.gf (100%) rename {gf => src}/FLexiconEng.gf (100%) rename {gf => src}/FLexiconGer.gf (100%) rename {gf => src}/FLexiconLog.gf (100%) rename {gf => src}/MCats.gf (100%) rename {gf => src}/MCatsEng.gf (100%) rename {gf => src}/MCatsGer.gf (100%) rename {gf => src}/MCatsI.gf (100%) rename {gf => src}/MCatsLog.gf (100%) rename gf/MNlp.gf => src/Mgf.gf (60%) create mode 100644 src/MgfEng.gf create mode 100644 src/MgfGer.gf create mode 100644 src/MgfLog.gf rename {gf => src}/UtilsEng.gf (100%) rename {gf => src}/UtilsGer.gf (100%) diff --git a/README.md b/README.md index 0c5e8dd..250cda4 100644 --- a/README.md +++ b/README.md @@ -1,75 +1,81 @@ -We want to use GF in order to parse mathematical documents. -Here is an overview of our current approach. -Note that the current implementation is merely a proof of concept and subject to constant change. Issues and discussions can also be found at <https://gl.kwarc.info/smglom/GF/issues>. +The Mathematical Grammatical Framework (MGF) +--- +MGF uses [GF](https://www.grammaticalframework.org/) in order to parse mathematical documents. +The project is still at an early stage. +Currently, the translation between English and German is supported for a small number of examples. +Furthermore, the sentences can be formalized in a logical representation. -Preprocessing + +Running the Examples === -Consider the following example sentence: +A number of examples can be found in `examples.gfs`. +You can run them directly with -``` -A positive integer $n$ is called prime iff there is no integer $1 < m < n$ such that $m \mid n$. +```bash +cat examples.gfs | gf --run ``` -This sentence (written in LaTeX) gets transformed into an html representation using LaTeXML. -We use our LLaMaPUn library to process this html representation and generate a (space-separated) token stream. -The formula representation is based directly on the corresponding presentation-MathML. -For the example sentence, this gives us the following token stream: +Alternatively, you can run them individually/test your own input. +After starting the GF command line, you first need to import the grammars: ``` -a positive integer $ mi( n ) $ is called prime iff there is no integer $ mrow( mn( 1 ) mo( < ) mi( m ) mo( < ) mi( n ) ) $ such that $ mrow( mi( m ) mo( | ) mi( n ) ) $``` +i src/MgfEng.gf +i src/MgfGer.gf +i src/MgfLog.gf ``` -The LLaMaPUn library allows us to map offsets in this string back to the nodes of the HTML representation. - - -Formula parsing -=== - -So far, the coverage for parsing formulae is tiny. -As an example, +Note that the German grammar (`MgfGer`) takes quite a while to get linked. +If you don't need it, feel free to skip it. +Afterwards, you can for example translate a sentence from English to German in the following way: ``` -mrow( mi( m ) mo( | ) mi( n ) ) +parse -lang=Eng "a unital quasigroup is called a loop" | l -lang=Ger ``` -matches the rule +A logical representation can be generated with the language `Log`: ``` -apply_bin_rel : FBinRelation -> FExpression -> FExpression -> FStatement; +parse -lang=Eng "a unital quasigroup is called a loop" | l -lang=Log ``` -The prefix "F" indicates that the categories act on the formula level. +It's also possible to view parse trees with the commands `vt` and `vp`: -A complete formula can be one of the following two categories: A `Statement` or a `MathCN`. -The `Statement` corresponds to a clause (`Cl`). The example above ($m \mid n$) would be an example for that. -A `MathCN` on the other hand corresponds to a noun. -In "A positive integer $n$", the $n$ would be such a `MathCN`. +``` +parse -lang=Eng "a unital quasigroup is called a loop" | vt +``` -Similarly, in "there is no integer $1 < m < n$", the formula also is a `MathCN`. It matches the rule: +it opens the parse tree with `open`. If this does not work, you can manually +specify a command to open the graphs. E.g. to open it with the +pdf viewer `zathura`, you need to call ``` -fexpr_fbinrel_fcid_fbinrel_fexpr_to_mathcn : -- introduces one identifier (like in $1 < n < 10$) - FExpression -> FBinRelation -> FComplexIdentifier -> FBinRelation -> FExpression -> MathCN; +parse -lang=Eng "a unital quasigroup is called a loop" | vt -view=zathura ``` -The `MathCN` concept is not good and we need to do more work on declarations (<https://gl.kwarc.info/smglom/GF/issues/2>). +Note that a png file is generated in the case of a single parse tree (which might require a different viewer). -Language/English parsing +Preprocessing === -The central categories on the language side are: `Statement`, `Definition`, `MObj`. -A `Statement` can be a formula (as described above) or a language statement. -An `MObj` is a "mathematical object", such as an "integer". -Often, identifiers are introduced in an apposition (as in "a positive integer $n$"). -This can be done using the following rule: +Consider the following example sentence: ``` -appo_mobj : MObj -> MathCN -> MObj; +A positive integer $n$ is called prime, iff there is no integer $1 < m < n$ such that $m \mid n$. ``` -As mentioned above, more work is needed on `MathCN`, but also `MObj` and declarations in general (<https://gl.kwarc.info/smglom/GF/issues/2>). +This sentence (written in LaTeX) gets transformed into an html representation using [LaTeXML](https://github.com/brucemiller/LaTeXML). +We use our [LLaMaPUn library](https://github.com/kwarc/llamapun) to process this html representation and generate a (space-separated) token stream. +The formula representation is based directly on the corresponding presentation-MathML. +For the example sentence, this gives us the following token stream: + +``` +a positive integer $ mi( n ) $ is called prime , iff there is no integer $ mrow( mn( 1 ) mo( < ) mi( m ) mo( < ) mi( n ) ) $ such that $ mrow( mi( m ) mo( | ) mi( n ) ) $ +``` + +The LLaMaPUn library allows us to map offsets in this string back to the nodes of the HTML representation. + Lexica diff --git a/examples.gfs b/examples.gfs index efbc9c2..6e47d3d 100644 --- a/examples.gfs +++ b/examples.gfs @@ -1,5 +1,15 @@ -i gf/MNlpEng.gf -i gf/MNlpLog.gf -parse "a positive integer $ mi( n ) $ is called prime iff there is no integer $ mrow( mn( 1 ) mo( < ) mi( m ) mo( < ) mi( n ) ) $ such that $ mrow( mi( m ) mo( | ) mi( n ) ) $" | l -lang=Log -parse "a set is called empty iff it is the empty set" -parse "an alphabet $ mi( A ) $ is a finite set" +-- import grammars. For the German grammar, linking can take quite a while. +i src/MgfEng.gf +i src/MgfGer.gf +i src/MgfLog.gf + +-- translate prime definition to German +p -lang=Eng "a positive integer $ mi( n ) $ is called prime , iff there is no integer $ mrow( mn( 1 ) mo( < ) mi( m ) mo( < ) mi( n ) ) $ such that $ mrow( mi( m ) mo( | ) mi( n ) ) $" | l -lang=Ger +-- translate prime definition to logical representation +p -lang=Eng "a positive integer $ mi( n ) $ is called prime , iff there is no integer $ mrow( mn( 1 ) mo( < ) mi( m ) mo( < ) mi( n ) ) $ such that $ mrow( mi( m ) mo( | ) mi( n ) ) $" | l -lang=Log + +-- parse tree of loop definition (pipe it into vp for visualization) +p -lang=Eng "a unital quasigroup is called a loop" + +-- translate a statement from German to English +p -lang=Ger "es gibt eine gerade ganze Zahl" | l -lang=Eng diff --git a/gf/MNlpEng.gf b/gf/MNlpEng.gf deleted file mode 100644 index 434b398..0000000 --- a/gf/MNlpEng.gf +++ /dev/null @@ -1,3 +0,0 @@ -concrete MNlpEng of MNlp = DLexiconEng, DGrammarEng, FGrammarEng, FLexiconEng ** { - -} diff --git a/gf/MNlpGer.gf b/gf/MNlpGer.gf deleted file mode 100644 index 7ce2593..0000000 --- a/gf/MNlpGer.gf +++ /dev/null @@ -1,3 +0,0 @@ -concrete MNlpGer of MNlp = DLexiconGer, DGrammarGer, FGrammarGer, FLexiconGer ** { - -} diff --git a/gf/MNlpLog.gf b/gf/MNlpLog.gf deleted file mode 100644 index d144c32..0000000 --- a/gf/MNlpLog.gf +++ /dev/null @@ -1,3 +0,0 @@ -concrete MNlpLog of MNlp = DLexiconLog, DGrammarLog, FGrammarLog, FLexiconLog ** { - -} diff --git a/gf/DGrammar.gf b/src/DGrammar.gf similarity index 100% rename from gf/DGrammar.gf rename to src/DGrammar.gf diff --git a/gf/DGrammarEng.gf b/src/DGrammarEng.gf similarity index 95% rename from gf/DGrammarEng.gf rename to src/DGrammarEng.gf index a863c98..f07a8b7 100644 --- a/gf/DGrammarEng.gf +++ b/src/DGrammarEng.gf @@ -40,8 +40,8 @@ concrete DGrammarEng of DGrammar = MCatsEng, GrammarEng, ExtraEng ** open Syntax iff_definition defi condition = lin S { s = defi.s ++ ("," | "") ++ "iff" ++ condition.s }; -- better solution for this? -- def_defmobj_defmobj a b = mkCl a b; def_mobj_is_mobj definiendum id definiens = - mkS (mkCl (DetCN aSg_Det (mobjToCN definiens id)) (mobjToCN definiendum)) -- a unitary quasigroup is a loop - | mkS (mkCl (DetCN aSg_Det (mobjToCN definiens id)) -- a unitary quasigroup is called a loop + -- mkS (mkCl (DetCN aSg_Det (mobjToCN definiens id)) (mobjToCN definiendum)) -- a unitary quasigroup is a loop + mkS (mkCl (DetCN aSg_Det (mobjToCN definiens id)) -- a unitary quasigroup is called a loop (PastPartAP (mkVPSlash call_V3 (DetCN aSg_Det (mobjToCN definiendum))))); -- mkS (mkCl (DetCN {- mkNP is for some reason ambiguous -} -- aSg_Det (appo_mobj definiendum representative)) definiens); diff --git a/gf/DGrammarGer.gf b/src/DGrammarGer.gf similarity index 86% rename from gf/DGrammarGer.gf rename to src/DGrammarGer.gf index 9ba9214..488d47f 100644 --- a/gf/DGrammarGer.gf +++ b/src/DGrammarGer.gf @@ -1,5 +1,6 @@ concrete DGrammarGer of DGrammar = MCatsGer, GrammarGer** open SyntaxGer, ParadigmsGer, UtilsGer, ResGer, ParamX, Prelude in { oper + call_V2 : V2 = mkV2 (ParadigmsGer.mkV "heißen"); call_V2A : V2A = mkV2A (ParadigmsGer.mkV "heißen"); call_V3 : V3 = mkV3 (ParadigmsGer.mkV "heißen"); exist_V2 : V2 = mkV2 (ParadigmsGer.mkV ("geben" | "existieren")); @@ -29,10 +30,10 @@ concrete DGrammarGer of DGrammar = MCatsGer, GrammarGer** open SyntaxGer, Paradi prop); -- (mkVP (mkVPSlash call_V2A prop))); -- TODO: FIX THIS!!! iff_definition defi condition = lin S { s = \\order => (defi.s!order) ++ "genau dann , wenn" ++ condition.s!Sub }; -- better solution for this? - def_mobj_is_mobj definiendum id definiens = - mkS (mkCl (DetCN aSg_Det (mobjToCN definiens id)) (mobjToCN definiendum)) -- a unitary quasigroup is a loop - | mkS (mkCl (DetCN aSg_Det (mobjToCN definiens id)) -- a unitary quasigroup is called a loop - (PastPartAP (mkVPSlash call_V3 (DetCN aSg_Det (mobjToCN definiendum))))); +-- def_mobj_is_mobj definiendum id definiens = + -- mkS (mkCl (DetCN aSg_Det (mobjToCN definiens id)) (mobjToCN definiendum)) -- a unitary quasigroup is a loop +-- mkS (mkCl (DetCN aSg_Det (mobjToCN definiens id)) -- a unitary quasigroup is called a loop +-- (mkVp call_V2 (DetCN aSg_Det (mobjToCN definiendum)))); utter_statement s = mkUtt s; utter_declaration s = mkUtt s; diff --git a/gf/DGrammarLog.gf b/src/DGrammarLog.gf similarity index 100% rename from gf/DGrammarLog.gf rename to src/DGrammarLog.gf diff --git a/gf/DLexicon.gf b/src/DLexicon.gf similarity index 100% rename from gf/DLexicon.gf rename to src/DLexicon.gf diff --git a/gf/DLexiconEng.gf b/src/DLexiconEng.gf similarity index 100% rename from gf/DLexiconEng.gf rename to src/DLexiconEng.gf diff --git a/gf/DLexiconGer.gf b/src/DLexiconGer.gf similarity index 92% rename from gf/DLexiconGer.gf rename to src/DLexiconGer.gf index 15e3b1a..45ae0a3 100644 --- a/gf/DLexiconGer.gf +++ b/src/DLexiconGer.gf @@ -4,7 +4,7 @@ concrete DLexiconGer of DLexicon = MCatsGer ** open SyntaxGer, ParadigmsGer, Res -- set_MObj = mkCN (mkN "set"); -- alphabet_MObj = mkCN (mkN "alphabet"); quasigroup_MObj = mkMObj (ParadigmsGer.mkN "Quasigruppe"); - loop_MObj = mkMObj (ParadigmsGer.mkN "Loop"); + loop_MObj = mkMObj (ParadigmsGer.mkN "Loop" Fem); integer_MObj = mkMObj (ParadigmsGer.mkN "ganze Zahl" Fem); finite_MObjProp = mkAP (ParadigmsGer.mkA "endlich"); diff --git a/gf/DLexiconLog.gf b/src/DLexiconLog.gf similarity index 100% rename from gf/DLexiconLog.gf rename to src/DLexiconLog.gf diff --git a/gf/DLexiconMObj.gf b/src/DLexiconMObj.gf similarity index 100% rename from gf/DLexiconMObj.gf rename to src/DLexiconMObj.gf diff --git a/gf/DLexiconMObjEng.gf b/src/DLexiconMObjEng.gf similarity index 100% rename from gf/DLexiconMObjEng.gf rename to src/DLexiconMObjEng.gf diff --git a/gf/DLexiconMObjLog.gf b/src/DLexiconMObjLog.gf similarity index 100% rename from gf/DLexiconMObjLog.gf rename to src/DLexiconMObjLog.gf diff --git a/gf/FGrammar.gf b/src/FGrammar.gf similarity index 100% rename from gf/FGrammar.gf rename to src/FGrammar.gf diff --git a/gf/FGrammarEng.gf b/src/FGrammarEng.gf similarity index 100% rename from gf/FGrammarEng.gf rename to src/FGrammarEng.gf diff --git a/gf/FGrammarGer.gf b/src/FGrammarGer.gf similarity index 100% rename from gf/FGrammarGer.gf rename to src/FGrammarGer.gf diff --git a/gf/FGrammarI.gf b/src/FGrammarI.gf similarity index 100% rename from gf/FGrammarI.gf rename to src/FGrammarI.gf diff --git a/gf/FGrammarLog.gf b/src/FGrammarLog.gf similarity index 100% rename from gf/FGrammarLog.gf rename to src/FGrammarLog.gf diff --git a/gf/FLexicon.gf b/src/FLexicon.gf similarity index 100% rename from gf/FLexicon.gf rename to src/FLexicon.gf diff --git a/gf/FLexiconEng.gf b/src/FLexiconEng.gf similarity index 100% rename from gf/FLexiconEng.gf rename to src/FLexiconEng.gf diff --git a/gf/FLexiconGer.gf b/src/FLexiconGer.gf similarity index 100% rename from gf/FLexiconGer.gf rename to src/FLexiconGer.gf diff --git a/gf/FLexiconLog.gf b/src/FLexiconLog.gf similarity index 100% rename from gf/FLexiconLog.gf rename to src/FLexiconLog.gf diff --git a/gf/MCats.gf b/src/MCats.gf similarity index 100% rename from gf/MCats.gf rename to src/MCats.gf diff --git a/gf/MCatsEng.gf b/src/MCatsEng.gf similarity index 100% rename from gf/MCatsEng.gf rename to src/MCatsEng.gf diff --git a/gf/MCatsGer.gf b/src/MCatsGer.gf similarity index 100% rename from gf/MCatsGer.gf rename to src/MCatsGer.gf diff --git a/gf/MCatsI.gf b/src/MCatsI.gf similarity index 100% rename from gf/MCatsI.gf rename to src/MCatsI.gf diff --git a/gf/MCatsLog.gf b/src/MCatsLog.gf similarity index 100% rename from gf/MCatsLog.gf rename to src/MCatsLog.gf diff --git a/gf/MNlp.gf b/src/Mgf.gf similarity index 60% rename from gf/MNlp.gf rename to src/Mgf.gf index 3a7e564..3fc0e11 100644 --- a/gf/MNlp.gf +++ b/src/Mgf.gf @@ -1,5 +1,5 @@ -- Collects all the stuff we have for parsing math -abstract MNlp = DLexicon, DGrammar, FGrammar, FLexicon ** { +abstract Mgf = DLexicon, DGrammar, FGrammar, FLexicon ** { flags startcat = Utterance; } diff --git a/src/MgfEng.gf b/src/MgfEng.gf new file mode 100644 index 0000000..403224f --- /dev/null +++ b/src/MgfEng.gf @@ -0,0 +1,3 @@ +concrete MgfEng of Mgf = DLexiconEng, DGrammarEng, FGrammarEng, FLexiconEng ** { + +} diff --git a/src/MgfGer.gf b/src/MgfGer.gf new file mode 100644 index 0000000..11666d7 --- /dev/null +++ b/src/MgfGer.gf @@ -0,0 +1,3 @@ +concrete MgfGer of Mgf = DLexiconGer, DGrammarGer, FGrammarGer, FLexiconGer ** { + +} diff --git a/src/MgfLog.gf b/src/MgfLog.gf new file mode 100644 index 0000000..72f121f --- /dev/null +++ b/src/MgfLog.gf @@ -0,0 +1,3 @@ +concrete MgfLog of Mgf = DLexiconLog, DGrammarLog, FGrammarLog, FLexiconLog ** { + +} diff --git a/gf/UtilsEng.gf b/src/UtilsEng.gf similarity index 100% rename from gf/UtilsEng.gf rename to src/UtilsEng.gf diff --git a/gf/UtilsGer.gf b/src/UtilsGer.gf similarity index 100% rename from gf/UtilsGer.gf rename to src/UtilsGer.gf -- GitLab