Commit 839dca77 authored by Ulrich

bug fixes and code cleanup

parent e9ad788a
......@@ -582,248 +582,6 @@ def process_annotation(html_soup, rdf_soup, a):
# div.append(math)
# node.replace_with(div)
def process_math_nodes(html_soup, annotations):
    """Split every annotated ``<math>`` element into several ``<math>`` tags.

    For each ``<math>`` element in *html_soup*, the annotations whose
    ``'container'`` starts with the element's id and whose nodes are all
    inside the element (according to ``contains_as_child``) are collected.
    Formulas for which ``contains_frac`` reports a fraction are skipped.
    Otherwise:

    * the nodes lying between two consecutive annotations, and the nodes
      after the last annotation, are moved into new ``<math>`` tags;
    * the nodes of every annotation are moved into their own new ``<math>``
      tag;
    * all new tags are inserted directly after the original element, in
      document order (annotation, gap, annotation, ..., tail).

    The original element stays where it is and keeps whatever was not moved.
    Progress is traced with ``print``.

    :param html_soup: parsed document; modified in place.
    :param annotations: iterable of mappings with a ``'container'`` key.
    :return: ``None``.
    """
    for math_element in html_soup.find_all('math'):
        relevant_nodes = []  # one node list per matching annotation
        relevant_list = []   # the same nodes flattened, used for tracing only
        for annotation in annotations:
            if not annotation['container'].startswith(math_element['id']):
                continue
            annotation_nodes = get_annotations_nodes(html_soup, annotation)
            if all(contains_as_child(math_element, node, False)
                   for node in annotation_nodes):
                relevant_nodes.append(annotation_nodes)
                relevant_list.extend(annotation_nodes)
        if not relevant_nodes:
            continue

        print("\n new math")
        print(math_element['id'])
        for nodes in relevant_nodes:
            for node in nodes:
                print(node['id'])
            print("")

        # NOTE(review): assumes the first child of <math> is <semantics> and
        # its first child is the presentation MathML - confirm with callers.
        semantics = next(math_element.children)
        pres_mathml = next(semantics.children)
        if contains_frac(pres_mathml.children):
            continue

        print("relevant nodes")
        for node in relevant_list:
            print(node['id'])
        print(math_element.prettify())

        # Phase 1: move the nodes between consecutive annotations into their
        # own <math> tags.  The gaps are keyed by the index of the annotation
        # they precede, so an empty gap cannot shift the later pairing.
        gaps = {}
        last = None
        for group_index, nodes in enumerate(relevant_nodes):
            for node_index, node in enumerate(nodes):
                if node_index == 0 and last is not None:
                    print("last id " + last['id'])
                    print("current id " + node['id'])
                    # extract everything from last to the current node
                    result = extract(last, node)
                    # extract() may return a list without `last` (e.g. when
                    # last equals node), so removal has to be guarded.
                    if last in result:
                        result.remove(last)
                    print("extraction")
                    print(result)
                    if result:
                        extracted_math = html_soup.new_tag("math")
                        for r in result:
                            extracted_math.append(r)
                        gaps[group_index] = extracted_math
                last = node

        print("current math")
        print(math_element.prettify())

        # Phase 2: extract everything from last till the end of the formula.
        result = extract(last, None)
        if last in result:
            result.remove(last)
        last_math = None
        if result:
            last_math = html_soup.new_tag("math")
            for r in result:
                last_math.append(r)

        # Phase 3: move each annotation's nodes into a fresh <math> tag and
        # assemble all pieces in document order.
        pieces = []
        for group_index, nodes in enumerate(relevant_nodes):
            if group_index in gaps:
                pieces.append(gaps[group_index])
            new_math = html_soup.new_tag("math")
            for node in nodes:
                node.extract()
                new_math.append(node)
            pieces.append(new_math)
        if last_math is not None:
            pieces.append(last_math)

        # Inserting in reverse directly after the original element leaves the
        # pieces in forward order.  The original element itself is not part
        # of the list: an element cannot be inserted after itself.
        for math in reversed(pieces):
            math_element.insert_after(math)
#print("Result:")
#print(math_element.prettify())
#for math in new_math_nodes_copy:
# print(math.prettify())
# print("dfs")
# first_non_relevant = dfs(pres_mathml, relevant_list, False)
# print(first_non_relevant)
# new_math = html_soup.new_tag("math")
# for node in relevant_list:
# node.extract()
# new_math.append(node)
# print("old math elem")
# print(math_element)
# print("new math elem")
# print(new_math)
#
# rest_of_old_math = html_soup.new_tag("math");
# get_non_relevant(first_non_relevant)
# print("rest of old math")
def contains_frac(kids):
    """Return True if any node in *kids*, or any of its descendants, is an
    ``<mfrac>`` element.

    Text nodes (``NavigableString``) are skipped wherever they occur, so a
    leading text node no longer hides an ``<mfrac>`` that follows it, and a
    text node in a later position is never asked for its children.

    :param kids: iterable of child nodes, e.g. ``tag.children``.
    :return: ``True`` if an ``<mfrac>`` was found, ``False`` otherwise.
    """
    for kid in kids:
        if isinstance(kid, NavigableString):
            # Text is not a tag, so it can neither be nor contain an mfrac.
            continue
        if kid.name == "mfrac" or contains_frac(kid.children):
            return True
    return False
def extract(start, end):
    """Collect the nodes from *start* up to, but not including, *end*.

    The walk follows ``next_sibling``; when a node has no next sibling it
    climbs to the nearest ancestor that has one.  A *start* named ``mrow``
    is replaced by its first child.  The walk stops when it reaches a node
    equal to *end*, an ``annotation-xml`` node, an empty ``mrow``, or the
    top of the tree.  Nodes are only collected; they are not detached.

    Comparison with *end* uses ``==``, i.e. the node type's own equality.
    Each call prints its start node and its result as a trace.

    :param start: node at which to start (included in the result unless it
        is an ``mrow``, equals *end*, or is ``annotation-xml``).
    :param end: node at which to stop, or ``None`` to walk to the end.
    :return: list of collected nodes in walk order.
    """
    print("start")
    print(start)
    if start == end:
        return []
    if start.name == "annotation-xml":
        return []
    if start.name == "mrow":
        start = next(start.children, None)
        if start is None:
            return []
    if start == end:
        return []
    res = [start]
    sib = start.next_sibling
    while sib is None:
        start = start.parent
        if start is None:
            # Ran past the root without meeting `end`: nothing more to
            # collect (previously this dereferenced None and crashed).
            break
        sib = start.next_sibling
    if sib is not None:
        res = res + extract(sib, end)
    print("return")
    print(res)
    return res
#if sib != None:
# return res + extract(sib, end)
#else:
# parent = start.parent
# while parent.next_sibling == None:
# parent = parent.parent
# return extract(parent.next_sibling, end)
def dfs(current, relevant_nodes, started):
    """Walk siblings and first children starting at *current* and report the
    first node that is not in *relevant_nodes* once a relevant node was seen.

    NOTE(review): the indentation of this function was reconstructed; the
    ``elif started`` branch is assumed to belong to
    ``if current in relevant_nodes`` (matching the "case 1/2/3" traces), but
    it could also bind to ``if ret == None`` - confirm against the original.

    :param current: node to inspect; may be ``None``.
    :param relevant_nodes: collection tested with ``current in ...``.
    :param started: whether a relevant node has already been passed.
    :return: ``None`` when *current* is ``None``, a ``NavigableString`` or an
        ``annotation-xml`` node; ``True`` when *current* is relevant and the
        walk over its following siblings returned ``None``; otherwise the
        node found, or the result of the recursive call.
    """
    print("")
    print(current)
    if current == None:
        return None
    if isinstance(current, NavigableString):
        return None;
    if current.name == "annotation-xml":
        return None
    print("current " + str(started) + " " + str(current in relevant_nodes))
    print(current['id'])
    if current in relevant_nodes:
        # Case 1: current is relevant - continue with the next sibling and
        # mark the walk as started.
        print("case 1")
        ret = dfs(current.next_sibling, relevant_nodes, True)
        # current.extract()
        #extract current
        # NOTE(review): when ret is not None (a node or True) nothing is
        # returned here, so the function falls off the end and yields None -
        # looks like a missing `return ret`; confirm intent.
        if ret == None:
            return True
    elif started:
        # Case 2: first non-relevant node after a relevant one.
        print("case 2")
        return current
    else:
        # Case 3: nothing relevant seen yet - descend into the first child.
        print("case 3")
        first_child = next(current.children, None)
        ret = dfs(first_child, relevant_nodes, started)
        if ret == True:
            # The subtree ended on relevant nodes; keep looking among the
            # following siblings with started=True.
            return dfs(current.next_sibling, relevant_nodes, True)
        else:
            print("ret")
            if ret != None:
                return ret
            return dfs(current.next_sibling, relevant_nodes, started)
def process(html_soup, rdf_soup, output_path):
#remove the jobad header
......
......@@ -365,8 +365,11 @@ pub fn numeric_unit_cnml_spotter(opt_node : Option<Node>, document : &Document,
res.push(amb);
found = true;
}
}else if kids[0].get_name().eq("csymbol") && kids[0].get_all_content().eq("subscript"){
return res;
}
if !found {
let mut r1 = numeric_unit_cnml_spotter(node.get_first_child(), document, config);
res.append(&mut r1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment