Commit 67057736 authored by jfschaefer

added tex2gf script (moved from gl.mathhub.info/teaching/LBS)

parent 697e10d0
#!/usr/local/bin/python3
"""
Harvests words from tex glossary files and writes them into gf files.
Note: We have functionality specialized for the abstract grammar
and for the concrete grammars.
As we want to keep everything in a single file, these parts
are prefixed with a_ (or A_) and with c_ (or C_) accordingly.
"""
import re
import sys
import os

def parse(string, regexes):
    """
    Assumes that regexes is a list of pairs (regex, token_type).
    Returns tokens from a string as pairs (match, token_type),
    sorted according to the match start.
    """
    tokens = []
    for (regex, token_type) in regexes:
        tokens += [(match, token_type) for match in re.finditer(regex, string)]
    return sorted(tokens, key=lambda e: e[0].start())
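
# Illustrative example (hypothetical, not from the original source): given
#   regexes = [(re.compile(r"b"), 1), (re.compile(r"a"), 0)]
# parse("ab", regexes) returns the "a" match (type 0) before the "b" match
# (type 1), since results are ordered by match start, not by regex order.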

def get_params(param_str):
    if param_str is None:
        return {}
    return {
        e[0]: "=".join(e[1:])
        for e in [p.split("=") for p in param_str.split(",") if "=" in p]
    }
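
# Illustrative example (hypothetical input, not from the original source):
#   get_params("name=emptyset,gfc=CN") == {"name": "emptyset", "gfc": "CN"}
# Entries without "=" are dropped; extra "="s are kept as part of the value,
# e.g. "gfl=x=y" yields {"gfl": "x=y"}.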
#
# Code for the abstract file (prefixed with A_ or a_)
#
A_TOKEN_BEGIN_MODSIG = 0
A_TOKEN_GIMPORT = 1
A_TOKEN_END_MODSIG = 2
A_TOKEN_SYM = 3
A_TOKEN_SYMDEF = 4
a_re_begin_modsig = re.compile(
    r"\\begin\s*\{modsig\}\s*"
    r"(?:\[[^\]]*\])?\s*"  # optional parameters
    r"\{(?P<name>[\w-]+)\}"  # name
)
a_re_gimport = re.compile(
    r"\\gimport\*?\s*"
    r"(?:\[[^\]]*\])?\s*"  # optional parameters
    r"\{(?P<name>[\w-]+)\}"  # name
)
a_re_end_modsig = re.compile(
    r"\\end\s*\{modsig\}"
)
a_re_sym = re.compile(
    r"\\sym(?:i|ii|iii|iv)\*?\s*"
    r"(?:\[(?P<params>[^\]]*)\])?\s*"  # parameters
    r"\{(?P<arg0>[a-zA-Z0-9-]+)\}"  # arg0
    r"(?:\s*\{(?P<arg1>[a-zA-Z0-9-]+)\})?"  # arg1
    r"(?:\s*\{(?P<arg2>[a-zA-Z0-9-]+)\})?"  # arg2
    r"(?:\s*\{(?P<arg3>[a-zA-Z0-9-]+)\})?"  # arg3
)
a_re_symdef = re.compile(
    r"\\symdef\*?\s*"
    r"(?:\[(?P<params>[^\]]*)\])?\s*"  # parameters
    r"\{(?P<name>[a-zA-Z0-9-]+)\}"  # name
)
a_regexes = [
    (a_re_begin_modsig, A_TOKEN_BEGIN_MODSIG),
    (a_re_gimport, A_TOKEN_GIMPORT),
    (a_re_end_modsig, A_TOKEN_END_MODSIG),
    (a_re_sym, A_TOKEN_SYM),
    (a_re_symdef, A_TOKEN_SYMDEF),
]
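
# Illustrative examples of the TeX macros these patterns match
# (hypothetical snippets, not from the original source):
#   \begin{modsig}{emptyset}    -> A_TOKEN_BEGIN_MODSIG (name: "emptyset")
#   \gimport{set}               -> A_TOKEN_GIMPORT (name: "set")
#   \symii[gfc=CN]{empty}{set}  -> A_TOKEN_SYM (args: "empty", "set")
#   \symdef[gfc=CN]{emptyset}   -> A_TOKEN_SYMDEF (name: "emptyset")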

def a_harvest(string, log_warn):
    """ Returns the module name, the gimports and the syms from the string """
    # states - some of them are just for error checking
    found_modsig = False
    in_modsig = False
    mod_name = None
    gimports = []
    syms = []
    for (match, token_type) in parse(string, a_regexes):
        if token_type == A_TOKEN_BEGIN_MODSIG:
            if found_modsig or in_modsig:
                raise Exception(f"Didn't expect '{match.group(0)}'")
            found_modsig = True
            in_modsig = True
            mod_name = match.group("name")
        elif token_type == A_TOKEN_GIMPORT:
            if not in_modsig:
                raise Exception("Unexpected gimport outside of modsig")
            gimports.append(match.group("name"))
        elif token_type == A_TOKEN_END_MODSIG:
            if not in_modsig:
                raise Exception(f"Didn't expect '{match.group(0)}'")
            in_modsig = False
        elif token_type == A_TOKEN_SYM:
            if not in_modsig:
                raise Exception(f"Didn't expect '{match.group(0)}'")
            params = get_params(match.group("params"))
            if "gfc" not in params:
                log_warn(f"Missing gfc in '{match.group(0)}' - skipping entry")
                continue
            args = [match.group(x) for x in ["arg0", "arg1", "arg2", "arg3"]]
            syms.append(("_".join([arg for arg in args if arg is not None]), params["gfc"]))
        elif token_type == A_TOKEN_SYMDEF:
            if not in_modsig:
                raise Exception(f"Didn't expect '{match.group(0)}'")
            params = get_params(match.group("params"))
            if "gfc" not in params:
                log_warn(f"Missing gfc in '{match.group(0)}' - skipping entry")
                continue
            name = match.group("name")
            if "name" in params:
                name = params["name"]  # if it is set explicitly, use that one instead
            syms.append((name, params["gfc"]))
        else:
            assert False
    if not found_modsig:
        raise Exception("Didn't find \\begin{modsig}")
    if in_modsig:
        raise Exception("Missing \\end{modsig}")
    return (mod_name, gimports, syms)
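
# Illustrative result (hypothetical input, not from the original source):
# for a string containing
#   \begin{modsig}{emptyset} \gimport{set} \symi[gfc=CN]{emptyset} \end{modsig}
# a_harvest returns ("emptyset", ["set"], [("emptyset", "CN")]).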

def a_generate(mod_name, gimports, syms, target):
    """ Writes an abstract gf file to target """
    target.write("--# -path=.:lib/prelude:lib/alltenses\n\n")
    target.write(f"abstract {mod_name} = ")
    if len(gimports) == 0:
        target.write("open Cat in {\n")
        target.write(" flags startcat = S;\n")
    else:
        target.write(", ".join(gimports) + " ** {\n")
    target.write(" fun\n")
    for (name, type_) in syms:
        target.write(f" {name} : {type_};\n")
    target.write("}\n")
#
# Code for the concrete file (prefixed with C_ or c_)
#
C_TOKEN_BEGIN_MHMODNL = 0
C_TOKEN_END_MHMODNL = 1
C_TOKEN_DEF = 2
c_re_begin_mhmodnl = re.compile(
    r"\\begin\s*"
    r"\{mhmodnl\}\s*"
    r"(?:\[[^\]]*\])?\s*"  # optional parameters
    r"\{(?P<name>[\w-]+)\}\s*"  # name
    r"\{(?P<lang>[\w-]+)\}"  # lang
)
c_re_end_mhmodnl = re.compile(
    r"\\end\s*\{mhmodnl\}"
)
c_re_def = re.compile(
    r"\\def(?:i|ii|iii|iv)\s*"
    r"(?:\[(?P<params>[^\]]*)\])?\s*"  # parameters
    r"\{(?P<arg0>[a-zA-Z0-9-]+)\}"  # arg0
    r"(?:\s*\{(?P<arg1>[a-zA-Z0-9-]+)\})?"  # arg1
    r"(?:\s*\{(?P<arg2>[a-zA-Z0-9-]+)\})?"  # arg2
    r"(?:\s*\{(?P<arg3>[a-zA-Z0-9-]+)\})?"  # arg3
)
c_regexes = [
    (c_re_begin_mhmodnl, C_TOKEN_BEGIN_MHMODNL),
    (c_re_end_mhmodnl, C_TOKEN_END_MHMODNL),
    (c_re_def, C_TOKEN_DEF),
]
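
# Illustrative example of the TeX input these patterns match
# (hypothetical snippet, not from the original source):
#   \begin{mhmodnl}{emptyset}{en}
#     \defii[name=emptyset,gfa=mkCN]{empty}{set}
#   \end{mhmodnl}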

def c_harvest(string, log_warn):
    """ Returns the module name, the language and the defs from the string """
    # states - some of them are just for error checking
    found_mhmodnl = False
    in_mhmodnl = False
    mod_name = None
    lang = None
    defs = []
    for (match, token_type) in parse(string, c_regexes):
        if token_type == C_TOKEN_BEGIN_MHMODNL:
            if found_mhmodnl or in_mhmodnl:
                raise Exception(f"Didn't expect '{match.group(0)}'")
            found_mhmodnl = True
            in_mhmodnl = True
            mod_name = match.group("name")
            lang = match.group("lang")
        elif token_type == C_TOKEN_END_MHMODNL:
            if not in_mhmodnl:
                raise Exception(f"Didn't expect '{match.group(0)}'")
            in_mhmodnl = False
        elif token_type == C_TOKEN_DEF:
            if not in_mhmodnl:
                raise Exception(f"Didn't expect '{match.group(0)}'")
            params = get_params(match.group("params"))
            args = [match.group(x) for x in ["arg0", "arg1", "arg2", "arg3"]]
            args = [arg for arg in args if arg is not None]
            name = params["name"] if "name" in params else "_".join(args)
            val = " ".join(args)
            if "gfl" in params:
                defs.append((name, params["gfl"]))
                continue
            if "gfa" not in params:
                log_warn(f"Missing gfa or gfl in '{match.group(0)}' - skipping entry")
                continue
            cons = params["gfa"]
            defs.append((name, f"{cons} \"{val}\""))
        else:
            assert False
    if not found_mhmodnl:
        raise Exception("Didn't find \\begin{mhmodnl}")
    if in_mhmodnl:
        raise Exception("Missing \\end{mhmodnl}")
    return (mod_name, lang, defs)
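
# Illustrative result (hypothetical, not from the original source): for the
# snippet shown above c_regexes, c_harvest returns
#   ("emptyset", "en", [("emptyset", 'mkCN "empty set"')])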

def c_generate(mod_name, lang, gimports, defs, target):
    """ Writes a concrete gf file to target """
    lang = lang.upper()
    target.write("--# -path=.:lib/prelude:lib/alltenses\n\n")
    target.write(f"concrete {mod_name}{lang} of {mod_name} = ")
    if len(gimports) == 0:
        target.write("open SyntaxEng, ParadigmsEng in {\n")
    else:
        target.write(", ".join([g + lang for g in gimports]) + " ** {\n")
    target.write(" lin\n")
    for (name, expr) in defs:
        target.write(f" {name} = {expr};\n")
    target.write("}\n")
#
# Main code
#

def convert(source_directory, name, target_directory):
    """ Converts the tex files for concept `name` in source_directory to gf files in target_directory """
    log_warn = lambda file_name: lambda message: print(f"{file_name}: Warning: {message}", file=sys.stderr)

    # harvest data for abstract grammar
    with open(os.path.join(source_directory, name + ".tex"), "r") as a_in_file:
        (a_mod_name, a_gimports, a_syms) = a_harvest(a_in_file.read(), log_warn(f"{name}.tex"))
    if a_mod_name != name:
        log_warn(f"{name}.tex")(f"mod name is '{a_mod_name}'")

    # generate abstract grammar
    with open(os.path.join(target_directory, name + ".gf"), "w") as a_out_file:
        a_generate(a_mod_name, a_gimports, a_syms, a_out_file)

    # determine languages for concrete grammar
    regex = re.compile(name + r"\.(?P<lang>[a-zA-Z]+)\.tex")
    langs = []
    for file_name in os.listdir(source_directory):
        m = regex.match(file_name)
        if m is not None:
            langs.append(m.group("lang"))

    for lang in langs:
        # harvest data
        with open(os.path.join(source_directory, f"{name}.{lang}.tex"), "r") as c_in_file:
            (c_mod_name, c_mod_lang, c_defs) = c_harvest(c_in_file.read(), log_warn(f"{name}.{lang}.tex"))
        # check for conflicts
        if c_mod_name != a_mod_name:
            log_warn(f"{name}.{lang}.tex")(f"mod name is '{c_mod_name}' but is '{a_mod_name}' in {name}.tex")
        if c_mod_lang != lang:
            log_warn(f"{name}.{lang}.tex")(f"language is '{c_mod_lang}'")
        a_syms_names = [e[0] for e in a_syms]
        for def_ in c_defs:
            if def_[0] not in a_syms_names:
                log_warn(f"{name}.{lang}.tex")(f"found '{def_[0]}', but it is not in {name}.tex")
        c_defs_names = [e[0] for e in c_defs]
        for sym in a_syms:
            if sym[0] not in c_defs_names:
                log_warn(f"{name}.{lang}.tex")(f"'{sym[0]}' not defined")
        # generate concrete grammar
        with open(os.path.join(target_directory, f"{name}{lang.upper()}.gf"), "w") as c_out_file:
            c_generate(c_mod_name, c_mod_lang, a_gimports, c_defs, c_out_file)
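
# Illustrative run (hypothetical paths, not from the original source):
#   convert("sets/source", "emptyset", "/tmp")
# reads sets/source/emptyset.tex plus any sets/source/emptyset.<lang>.tex,
# and writes /tmp/emptyset.gf plus /tmp/emptyset<LANG>.gf for each language.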

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: tex2gf {SOURCE_DIR} {NAME} {TARGET_DIR}")
        print("Example: tex2gf smglong/sets/source emptyset /tmp")
    else:
        convert(sys.argv[1], sys.argv[2], sys.argv[3])