Commit 267fab09 authored by Ulrich

new scorer for references + bugfix in javascript

parent 839dca77
......@@ -443,8 +443,8 @@ def process_annotation(html_soup, rdf_soup, a):
math = semantics_copy.parent
div = html_soup.new_tag("div")
div['aria-label'] = label
div['role'] = "math"
div['aria-label'] = label
div['style'] = "display:inline"
math.replace_with(div)
......@@ -456,9 +456,10 @@ def process_annotation(html_soup, rdf_soup, a):
return
else:
#textual or only partly math
div = html_soup.new_tag("div")
# div['role'] = "none"
div['aria-label'] = label
div['role'] = "math"
div['style'] = "display:inline"
for node in annotation_nodes:
......
......@@ -605,16 +605,23 @@ highlight = false;
// var pre = found_array[j].prefix;
// var from = found_array[j].lookup;
//
console.log(found_array[j]);
console.log(conv.toString());
if (pre != ""){
var factor = prefixes_map[pre];
//conv = conv * Math.pow(10,parseFloat(factor));
var pow = new Big("10").pow(parseFloat(factor));
pow = pow.pow(found_array[j].exp);
// pow = pow.pow(found_array[j].exp);
conv = conv.times(pow);
conv = conv.pow(found_array[j].exp)
}
console.log(conv.toString());
// console.log(pre + from + " = " + conv + " " + to);
var content_mathml = annotations[i].mathml[0].childNodes[2];
......@@ -634,6 +641,10 @@ highlight = false;
}
console.log(number + " " + pre + from + " = " + new_number + " " + to);
if (found_array[j].exp != 1){
to = to + "^" + found_array[j].exp.toString();
}
create_pres_mathml(new_number, res, found_array[j], to, annotations[i]);
......
......@@ -90,3 +90,4 @@ yr year
yrs year
years year
arcsec arcsec
arcmin arcmin
......@@ -11,26 +11,29 @@ use spotter_lib::util::*;
use spotter_lib::data::*;
use libxml::tree::*;
use libxml::parser::Parser;
use llamapun::data::Corpus;
use llamapun::data::Document as lDoc;
/* This adds scores to ambiguities of an annotation. The higher the score, the more likely the annotation. */
pub fn main() {
let args : Vec<_> = env::args().collect();
if args.len() <= 1{
println!("expecting the input dir/file as an argument");
if args.len() <= 2{
println!("usage: cargo run --bin scoring AnnotationPath DocumentPath");
return;
}
let config = parse_config("config.txt");
let input_path = args[1].to_owned();
let document_path = args[2].to_owned();
search_dir(Path::new(&input_path), &config);
search_dir(Path::new(&input_path), Path::new(&input_path), Path::new(&document_path), &config);
}
pub fn search_dir(dir_path : &Path, config : &Config){
pub fn search_dir(dir_path : &Path, old_path : &Path, document_path : &Path, config : &Config){
let opt_path = fs::read_dir(&dir_path);
if opt_path.is_err(){
......@@ -46,19 +49,19 @@ pub fn search_dir(dir_path : &Path, config : &Config){
let name = p.display();
if entry.metadata().is_ok() && entry.metadata().unwrap().is_dir(){
search_dir(&p, config);
search_dir(&p, old_path, document_path, config);
}else{
let string = format!("{}",name);
if string.ends_with(".kat.xml") {
println!("Scoring {}", name);
read_document(&string, config);
read_document(&string, old_path, document_path, config);
}
}
}
}
pub fn read_document(s : &str, config : &Config){
pub fn read_document(s : &str, old_path : &Path, document_path : &Path, config : &Config){
let parser = Parser::default();
let opt_doc = parser.parse_file(s);
if opt_doc.is_err(){
......@@ -67,11 +70,30 @@ pub fn read_document(s : &str, config : &Config){
}
let doc = opt_doc.unwrap();
let root = doc.get_root_element().unwrap();
find_content(&root, &doc, config);
//open also the other document here
//strip the old_path prefix from s and append the remainder to document_path to get the original document
let old_str = format!("{}", old_path.display());
let offset = &s[old_str.len()..];
let mut original_document = format!("{}/{}",document_path.display(), offset);
original_document = original_document.replace(".kat.xml",".html");
println!("loading document {}", original_document);
let corpus = Corpus::new(original_document.clone());
let opt_orig_doc = corpus.load_doc(original_document.clone());
if opt_orig_doc.is_err(){
println!("Could not open file {}", original_document.clone());
return;
}
let orig_doc = opt_orig_doc.unwrap();
find_content(&root, &doc, &orig_doc, config);
doc.save_file(&s).unwrap();
}
fn find_content(node : &Node, document : &Document, config : &Config) {
fn find_content(node : &Node, annotation_document : &Document, original_document : &lDoc, config : &Config) {
let mut cnml = false;
match node.get_type().unwrap() {
NodeType::ElementNode => {
......@@ -82,24 +104,59 @@ fn find_content(node : &Node, document : &Document, config : &Config) {
let mut vec = Vec::new();
let mut opt_score = None;
while c.is_some(){
let c_unw = c.unwrap();
if c_unw.get_name().eq("contentmathml"){
let cc = c_unw.get_first_child();
if let Some(child) = cc {
let qe = parse_cnml(&child, document).unwrap();
if opt_score.is_some(){
c_unw.remove_property_with_name("score");
c_unw.add_property("score", opt_score.unwrap());
let inv_score = 1.0 / score_annotation(&qe);
let as_score = bad_score_for_as(&qe);
}else {
if let Some(child) = cc {
let qe = parse_cnml(&child, annotation_document).unwrap();
c_unw.remove_property_with_name("score");
let inv_score = 1.0 / score_annotation(&qe);
let as_score = bad_score_for_as(&qe);
let new_score = if as_score < 0.0 { as_score } else { inv_score };
c_unw.add_property("score", &format!("{}", new_score));
c_unw.remove_property_with_name("score");
let new_score = if as_score < 0.0 { as_score } else { inv_score };
c_unw.add_property("score", &format!("{}", new_score));
vec.push(qe);
}
}
}else if c_unw.get_name().eq("annotates"){
//eliminate annotations when they occur in the bibliography section
let url = c_unw.get_property("resource").unwrap();
// let container = url.split("'").nth(1).unwrap();
let start = url.split("'").nth(3).unwrap();
// let end = url.split("'").nth(5).unwrap();
let query = format!("//*[@id='{}']", start);
let res = original_document.xpath_context.evaluate(&query);
let vec = res.unwrap().get_nodes_as_vec();
if vec.len() != 1{
println!("found multiple or no node for query {}", &query);
}else{
let mut first_node = vec[0].clone();
while let Some(parent) = first_node.get_parent(){
let class = parent.get_class_names();
if parent.get_name().eq("section") && parent.get_class_names().contains("ltx_bibliography"){
opt_score = Some("-1.0");
break;
}
first_node = parent;
}
vec.push(qe);
}
}
......@@ -110,7 +167,7 @@ fn find_content(node : &Node, document : &Document, config : &Config) {
let opt_new_qe = add_meaning_for_as(&vec);
if let Some(mut new_qe) = opt_new_qe{
let new_node = Node::new("kat:contentmathml", None, document).unwrap();
let new_node = Node::new("kat:contentmathml", None, annotation_document).unwrap();
new_node.add_property("rdf:parseType","Literal");
let parser = Parser::default();
......@@ -136,7 +193,7 @@ fn find_content(node : &Node, document : &Document, config : &Config) {
if !cnml {
let mut c: Option<Node> = node.get_first_child();
while let Some(child) = c {
find_content(&child, document, config);
find_content(&child, annotation_document, original_document, config);
c = child.get_next_sibling();
}
}
......
......@@ -365,8 +365,24 @@ pub fn numeric_unit_cnml_spotter(opt_node : Option<Node>, document : &Document,
res.push(amb);
found = true;
}
}else if kids[0].get_name().eq("csymbol") && kids[0].get_all_content().eq("subscript"){
return res;
}else if kids[0].get_name().eq("csymbol") && kids[0].get_all_content().eq("superscript") &&
cnml_is_numeric(&kids[1]) && kids[2].get_all_content().eq("′′"){
let mut amb = Ambiguity::new(node.get_property("xref").unwrap(), kids[1].get_property("xref").unwrap(), kids[2].get_property("xref").unwrap());
let mut qe = SpottedQE::new(Some(document.dom.node_to_string(&kids[1])));
let unit = Unit::new("".to_string(),"arcsec".to_string(),"1".to_string());
qe.add_mul_unit(unit);
amb.add_ambiguity(qe);
res.push(amb);
}else if kids[0].get_name().eq("csymbol") && kids[0].get_all_content().eq("superscript") &&
cnml_is_numeric(&kids[1]) && kids[2].get_all_content().eq("′"){
let mut amb = Ambiguity::new(node.get_property("xref").unwrap(), kids[1].get_property("xref").unwrap(), kids[2].get_property("xref").unwrap());
let mut qe = SpottedQE::new(Some(document.dom.node_to_string(&kids[1])));
let unit = Unit::new("".to_string(),"arcmin".to_string(),"1".to_string());
qe.add_mul_unit(unit);
amb.add_ambiguity(qe);
res.push(amb);
}else if kids[0].get_name().eq("csymbol") && (kids[0].get_all_content().eq("subscript") || kids[0].get_all_content().eq("superscript")){
found = true;
}
......@@ -423,7 +439,9 @@ pub fn case_divide_apply(node : &Node, kids : &[Node], document : &Document, con
my_qe.div_units.append(&mut qe.mul_units);
}
amb.add_ambiguity(my_qe);
amb.add_ambiguity(my_qe.clone());
println!("found {}", my_qe.to_string());
return Some(amb);
}
......
......@@ -112,6 +112,9 @@
3 & $-29^{\circ}50^{\prime}58^{\prime\prime}.7$ &
\verb|$-29^{\circ}50^{\prime}| & \cite{astro-ph/9807152} \\ & &
\verb|58^{\prime\prime}.7$| & \\
4 & $0.025^{\circ}C$ & \verb|$0.025^{\circ}C$| & \cite{patt-sol/9807001} \\
5 & $2\;^{\circ}$ C & \verb|$2\;^{\circ}$|C & \cite{gr-qc/9211005} \\
6 & $10^{\prime\prime}$ & \verb|$10^{\prime\prime}$| & \cite{gr-qc/9211006} \\
\hline
\end{tabular}
\caption{Examples of the category of quantity
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment