Commit f99de261 authored by Ulrich's avatar Ulrich

changing everything to parallel

parent 58c3059d
......@@ -458,7 +458,7 @@ def process_annotation(html_soup, rdf_soup, a):
else:
#textual or only partly math
div = html_soup.new_tag("div")
# div['role'] = "none"
div['role'] = "math"
div['aria-label'] = label
div['style'] = "display:inline"
......@@ -468,12 +468,20 @@ def process_annotation(html_soup, rdf_soup, a):
break
for node in annotation_nodes:
if node.name in math_name:
math = html_soup.new_tag("math")
math.append(node)
div.append(math)
else:
div.append(node)
node.extract()
math = html_soup.new_tag("math")
math.append(mrow)
div.append(math)
# for node in annotation_nodes:
# if node.name in math_name:
# math = html_soup.new_tag("math")
# math.append(node)
# div.append(math)
# else:
# div.append(node)
......
......@@ -38,6 +38,7 @@ regex = "0.1"
time = "0.1"
unidecode = "0.2.0"
fs_extra = "0.2.0"
threadpool = "1.3.2"
[dependencies.rust-senna]
git = "https://github.com/jfschaefer/rust-senna.git"
[dependencies.libxml]
......
# Convenience targets for running the llamapun-based spotter binaries
# against a single HTML test document. Select the document by switching
# which HTML assignment is uncommented.
# NOTE: recipe lines below MUST be indented with a tab character — the
# extracted copy had lost them, which makes `make` reject the file.
.PHONY: all run
#HTML = ../PapersFromCortex/small_eval/hep-ph9807232.html
#HTML = ../../KAT/content/physics9807021.html
#HTML = ../../KAT/content/astro-ph9807152.html
#HTML = ../../KAT/content/astro-ph9807288.html
#HTML = ../../KAT/content/hep-ph9807532.html
#HTML = ../../KAT/content/cond-mat9807111.html
HTML = ../PapersFromCortex/small_eval/hep-ph9807306.html
#HTML = ../PapersFromCortex/astro-ph9807152/object.html

# Default target: run the crate's test suite.
all:
	cargo test

# Dump the parsed node tree of the selected document.
nodes:
	cargo run --bin print_nodes ${HTML}

# Run only the tokenizer over the selected document.
tokenize:
	cargo run --bin tokenizer_bin ${HTML}

# Open the selected document in a browser for manual inspection.
html:
	firefox ${HTML}

# Spotter sub-modes (kat export / conversion / harvesting).
kat:
	cargo run --bin spotter_bin kat ${HTML}

convert:
	cargo run --bin spotter_bin convert ${HTML}

harvest:
	cargo run --bin spotter_bin harvest ${HTML}

# Build, then run the debug binary with backtraces enabled.
debug: all
	RUST_BACKTRACE=1 ./target/debug/examples/spotter kat ${HTML}
# declaration-spotter
A declaration spotter based on the llamapun library
# QE spotter
A spotter for quantity expressions based on the llamapun library
# Getting started
Use the Makefile
Workflow:
Preprocess the documents, run
```cargo run --bin preprocess Inputpath Outputpath```
......@@ -17,15 +15,15 @@ Run the scorer with
```cargo run --bin scoring Outputpath AnnotationOutputpath```
to add scores to the annotations.
Create a harvest with
```cargo run --bin harvesting AnnotationOutputpath Harvest-outputpath```
Where inputpath is a directory containing the plain html files (possibly in subfolders),
outputpath is a dir where the tokenized files with the jobad header are stored
and AnnotationOutputpath is the place where annotations are stored.
Use
```cargo build --release```
for optimizations
Use the release flag for optimizations, i.e.
```cargo run --release --bin spotter Outputpath AnnotationOutputpath```
Deprecated:
Run
......
......@@ -3,16 +3,19 @@ extern crate libxml;
extern crate senna;
extern crate spotter_lib;
extern crate fs_extra;
extern crate threadpool;
use std::env;
use fs_extra::dir::*;
//use std::thread;
use llamapun::data::Corpus;
//use llamapun::data::Document;
use spotter_lib::tokenize::tokenize;
use spotter_lib::util::add_jobad_support;
//use rayon::par_iter::*;
use spotter_lib::util::*;
use std::time::Duration;
use threadpool::ThreadPool;
use std::sync::mpsc::channel;
pub fn main() {
let args : Vec<_> = env::args().collect();
......@@ -50,38 +53,55 @@ pub fn main() {
let mut corpus = Corpus::new(new_output.to_owned());
//let corpus_path_len = corpus.path.clone().len();
for document in corpus.iter() {
println!("Preprocessing {}", document.path);
let pool = ThreadPool::new(CORES);
let (tx, rx) = channel();
let mut documents_vec = vec![];
for document in corpus.iter() {
documents_vec.push(document.path.clone());
let my_document_path = document.path.clone();
let my_tx = tx.clone();
pool.execute(move || {
let th_corpus = Corpus::new(my_document_path.clone());
let th_document = th_corpus.load_doc(my_document_path.clone()).unwrap();
let mut dom = tokenize(&th_document.dom);
dom = add_jobad_support(dom);
dom.save_file(&my_document_path).unwrap();
my_tx.send(my_document_path).unwrap();
});
}
let mut dom = tokenize(&document.dom);
dom = add_jobad_support(dom);
dom.save_file(&document.path).unwrap();
let number_of_documents = documents_vec.len();
/*
let mut successful_documents = vec![];
let relative;
if document.path.eq(corpus_path){
relative = document.path.split("/").last().unwrap();
}else {
relative = &document.path[corpus_path_len..];
let timeout = Duration::from_secs(200);
for i in 0..number_of_documents{
let res = rx.recv_timeout(timeout);
if res.is_ok(){
let succ_doc = res.unwrap();
successful_documents.push(succ_doc.clone());
println!("finished ({}/{}) {}", i, number_of_documents, succ_doc);
}else{
println!("timeout");
}
}
let output = format!("{}/{}", output_path, relative);
let folder = &output[..output.as_str().rfind("/").unwrap()];
let mut doc_panic = vec![];
match create_dir_all(folder) {
Ok(_) => {},
Err(_) => {
println!("Could not create folder {}\n skipping file", folder);
continue;
}
for document in documents_vec{
if !successful_documents.contains(&document){
doc_panic.push(document);
}
}
println!("Saving at {}", &output);
dom.save_file(&output).unwrap();
*/
for document in doc_panic{
println!("Panic at {}", document);
}
}
......@@ -2,6 +2,8 @@ extern crate llamapun;
extern crate libxml;
extern crate senna;
extern crate spotter_lib;
extern crate threadpool;
use std::fs;
use std::env;
......@@ -14,6 +16,10 @@ use libxml::parser::Parser;
use llamapun::data::Corpus;
use llamapun::data::Document as lDoc;
use llamapun::pattern_example_adaption::get_declarations;
use threadpool::ThreadPool;
use std::sync::mpsc::channel;
use std::time::Duration;
/* This adds scores to ambiguities of an annotation. The higher the score, the more likely the annotation. */
......@@ -22,7 +28,7 @@ pub fn main() {
let args : Vec<_> = env::args().collect();
if args.len() <= 2{
println!("usage: cargo run --bin scoring AnnotationPath DocumentPath");
println!("usage: cargo run --bin scoring DocumentPath AnnotationPath");
return;
}
......@@ -31,36 +37,81 @@ pub fn main() {
let input_path = args[2].to_owned();
let document_path = args[1].to_owned();
search_dir(Path::new(&input_path), Path::new(&input_path), Path::new(&document_path), &config);
let doc_vec = search_dir(Path::new(&input_path));
let pool = ThreadPool::new(CORES);
let (tx, rx) = channel();
for my_annotation_path in doc_vec.clone(){
let my_tx = tx.clone();
let my_config = config.clone();
let my_document_path = document_path.clone();
let my_input_path = input_path.clone();
pool.execute(move || {
read_document(&my_annotation_path, Path::new(&my_input_path), Path::new(&my_document_path), &my_config);
my_tx.send(my_annotation_path).unwrap();
});
}
let number_of_documents = doc_vec.len();
let mut successful_documents = vec![];
let timeout = Duration::from_secs(200);
for i in 0..number_of_documents{
let res = rx.recv_timeout(timeout);
if res.is_ok(){
let succ_doc = res.unwrap();
successful_documents.push(succ_doc.clone());
println!("finished ({}/{}) {}", i, number_of_documents, succ_doc);
}else{
println!("timeout");
}
}
let mut doc_panic = vec![];
for document in doc_vec{
if !successful_documents.contains(&document){
doc_panic.push(document);
}
}
for document in doc_panic{
println!("Panic at {}", document);
}
}
pub fn search_dir(dir_path : &Path, old_path : &Path, document_path : &Path, config : &Config){
pub fn search_dir(dir_path : &Path) -> Vec<String>{
let opt_path = fs::read_dir(&dir_path);
let mut doc_vec = vec![];
if opt_path.is_err(){
println!("error reading dir {}", &dir_path.display());
return;
return doc_vec;
}
let dir_path = opt_path.unwrap();
for path in dir_path {
let entry = path.unwrap();
let p = entry.path();
let name = p.display();
if entry.metadata().is_ok() && entry.metadata().unwrap().is_dir(){
search_dir(&p, old_path, document_path, config);
doc_vec.append(&mut search_dir(&p));
}else{
let string = format!("{}",name);
if string.ends_with(".kat.xml") {
println!("Scoring {}", name);
read_document(&string, old_path, document_path, config);
doc_vec.push(string.clone());
//read_document(&string, old_path, document_path, config);
}
}
}
doc_vec
}
pub fn read_document(s : &str, old_path : &Path, document_path : &Path, config : &Config){
......@@ -80,7 +131,7 @@ pub fn read_document(s : &str, old_path : &Path, document_path : &Path, config :
let offset = &s[old_str.len()..];
let mut original_document = format!("{}/{}",document_path.display(), offset);
original_document = original_document.replace(".kat.xml",".html");
println!("loading document {}", original_document);
// println!("loading document {}", original_document);
let corpus = Corpus::new(original_document.clone());
let opt_orig_doc = corpus.load_doc(original_document.clone());
......@@ -92,7 +143,7 @@ pub fn read_document(s : &str, old_path : &Path, document_path : &Path, config :
let orig_doc = opt_orig_doc.unwrap();
//call Frederiks program
println!("Spotting declarations...");
// println!("Spotting declarations...");
let decl_vec = get_declarations(original_document.clone());
......@@ -101,12 +152,12 @@ pub fn read_document(s : &str, old_path : &Path, document_path : &Path, config :
let mut declaration_symbols = Vec::new();
for decl in decl_vec{
println!("declaration {}", decl.clone());
// println!("declaration {}", decl.clone());
let opt_decl_nodes = xpath_context.evaluate(&decl);
if opt_decl_nodes.is_err(){
println!("error getting declaration at {}", decl);
// println!("error getting declaration at {}", decl);
continue;
}
......@@ -115,10 +166,10 @@ pub fn read_document(s : &str, old_path : &Path, document_path : &Path, config :
let content = get_all_content(&decl_node);
declaration_symbols.push(content);
println!("node content {}", get_all_content(&decl_node));
// println!("node content {}", get_all_content(&decl_node));
}
println!("done.");
// println!("done.");
// for decl in decl_vec{
// println!("decl {}", decl);
......
......@@ -5,6 +5,7 @@ extern crate regex;
extern crate time;
extern crate unidecode;
extern crate spotter_lib;
extern crate threadpool;
use llamapun::data::{Document, Corpus};
......@@ -15,8 +16,12 @@ use libxml::tree::Node;
use std::env;
use std::fs::*;
use std::time::Duration;
use spotter_lib::data::*;
use spotter_lib::util::*;
use threadpool::ThreadPool;
use std::sync::mpsc::channel;
//use regex::RegexSet;
......@@ -35,81 +40,110 @@ pub fn main() {
let corpus_path = args[1].to_owned();
let annotation_path = args[2].to_owned();
let corpus_path_len = corpus_path.clone().len();
println!("Loading corpus from {}", corpus_path);
let mut mut_corpus = Corpus::new(corpus_path.clone());
let config = parse_config("config.txt");
for mut document in mut_corpus.iter() {
let path = document.path.clone();
//let mut children = vec![];
let pool = ThreadPool::new(CORES);
let (tx, rx) = channel();
let mut documents_vec = vec![];
for document in mut_corpus.iter() {
documents_vec.push(document.path.clone());
let my_document_path = document.path.clone();
println!("Loading document {}", path);
let my_corpus_path = corpus_path.clone();
let my_annotation_path = annotation_path.clone();
let my_config = config.clone();
let file_name = path.split("/").last().unwrap();
let my_tx = tx.clone();
pool.execute(move || {
handle_document(my_document_path.clone(), my_corpus_path, my_annotation_path, &my_config);
my_tx.send(my_document_path).unwrap();
});
}
let new_path;
let relative;
if path.eq(&corpus_path){
new_path = annotation_path.clone();
relative = file_name;
let number_of_documents = documents_vec.len();
let mut successful_documents = vec![];
let timeout = Duration::from_secs(200);
for i in 0..number_of_documents{
let res = rx.recv_timeout(timeout);
if res.is_ok(){
let succ_doc = res.unwrap();
successful_documents.push(succ_doc.clone());
println!("finished ({}/{}) {}", i, number_of_documents, succ_doc);
}else{
relative = &path[corpus_path_len..];
let relative_folder = &relative[..(relative.len() - file_name.len())];
new_path = format!("{}/{}",annotation_path, relative_folder);
println!("timeout");
}
}
let mut doc_panic = vec![];
match create_dir_all(&new_path) {
Ok(_) => {},
Err(_) => {
println!("Could not create folder {}\n skipping file", new_path);
continue;
}
for document in documents_vec{
if !successful_documents.contains(&document){
doc_panic.push(document);
}
}
/*
let file_path = config.kat_content_dir.clone() + "out.html";
new_dom.save_file(&file_path).unwrap();
for document in doc_panic{
println!("Panic at {}", document);
}
let mut document = corpus.load_doc(file_path).unwrap();
*/
}
let mut ambiguities = Vec::new();
pub fn handle_document(path : String, corpus_path : String, annotation_path : String, config : &Config){
ambiguities.append(&mut evaluate_text(&mut document, &config));
// println!("done text");
ambiguities.append(&mut evaluate_math(&mut document, &config));
// println!("done math");
let corpus = Corpus::new(path.clone());
let mut document = corpus.load_doc(path.clone()).unwrap();
let save_location = format!("{}/{}", annotation_path.clone(),
relative.replace("html","kat.xml"));
println!("Saving annotation at {}", save_location.clone());
create_kat_export(&save_location, &ambiguities, &config, file_name);
let corpus_path_len = corpus_path.len();
}
// println!("Loading document {}", path);
/*
if args[1].eq("kat") {
create_kat_export(&config.kat_output_path, &ambiguities, &config);
}else if args[1].eq("convert"){
println!("Saving annotation at {}/annotations/{}", config.convert_output_dir.clone(), file_name.replace("html","kat.xml"));
create_kat_export(&(config.convert_output_dir.clone() + "/annotations/" + &file_name.replace("html","kat.xml")), &ambiguities, &config);
} else if args[1].eq("harvest") {
let mut docu_path = old_document.path.clone();
docu_path = docu_path.replace("html", "harvest");
let file_name = docu_path.split("/").last().unwrap();
let file_name = path.split("/").last().unwrap();
create_harvest_export(&config.harvest_output_dir, file_name, &mut ambiguities, &config);
} else if args[1].eq("aria"){
document.dom.save_file(&old_document.path.replace("html", "aria.html")).unwrap();
} else {
println!("unknown option {}", args[1]);
let new_path;
let relative;
if path.eq(&corpus_path){
new_path = annotation_path.clone();
relative = file_name;
}else{
relative = &path[corpus_path_len..];
let relative_folder = &relative[..(relative.len() - file_name.len())];
new_path = format!("{}/{}",annotation_path, relative_folder);
}
match create_dir_all(&new_path) {
Ok(_) => {},
Err(_) => {
println!("Could not create folder {}\n skipping file", new_path);
return;
}
*/
// }
}
let mut ambiguities = Vec::new();
ambiguities.append(&mut evaluate_text(&mut document, &config));
// println!("done text");
ambiguities.append(&mut evaluate_math(&mut document, &config));
// println!("done math");
let save_location = format!("{}/{}", annotation_path.clone(),
relative.replace("html","kat.xml"));
// println!("Saving annotation at {}", save_location.clone());
create_kat_export(&save_location, &ambiguities, &config, file_name);
println!("thread finished {}", path);
}
......
#!/usr/bin/env python3
"""Unpack every ``tex_to_html.zip`` found one level below a given directory.

Usage: ``python3 unpack.py dir_name`` where ``dir_name`` is relative to the
current working directory. For each immediate subdirectory, the archive
``tex_to_html.zip`` (if present) is unzipped in place and all ``*.zip``
files in that subdirectory are deleted afterwards.
"""
import os
import subprocess
import sys


def main():
    """Validate the command line, then unpack and clean each subfolder."""
    if len(sys.argv) != 2:
        print("usage:python3 unpack.py dir_name")
        # sys.exit() is correct in scripts; bare exit() relies on the
        # interactive `site` module being loaded.
        sys.exit()
    # Build an absolute path so os.chdir below never goes stale.
    folder = os.getcwd() + "/" + sys.argv[1]
    if not os.path.isdir(folder):
        print("argument " + folder + " is not a folder")
        sys.exit()
    print("Searching in " + folder)
    for f_out in os.listdir(folder):
        # Skip plain files at the top level; only descend into folders.
        if not os.path.isdir(folder + "/" + f_out):
            continue
        os.chdir(folder + "/" + f_out)
        for f_in in os.listdir("./"):
            if f_in == "tex_to_html.zip":
                # Argument-list form avoids shell interpretation of
                # unusual filenames (os.system("unzip " + f_in) would
                # break on spaces or shell metacharacters).
                subprocess.run(["unzip", f_in])
                print("unpacked " + f_out + "/" + f_in)
            # Clean up archives (including the one just unpacked).
            if f_in.endswith(".zip"):
                os.remove(f_in)


if __name__ == "__main__":
    main()
......@@ -19,7 +19,7 @@ pub fn get_div_chars_string() -> Vec<String>{
}
#[derive(Debug)]
#[derive(Debug, Clone)]
pub struct Config{
pub prefix_map : HashMap<String,String>,
pub units_map : HashMap<String, String>,
......
......@@ -18,6 +18,9 @@ use libxml::tree::Document as DOM;
static KAT_QE : &'static str = "KAT_1_QuantityExpression";
pub static CORES : usize = 3;
/* Returns the content of the "annotation" child of a math node, which contains the tex-code */
pub fn get_tex_from_math(math_node : Node) -> String{
let semantics = math_node.get_first_child().unwrap();
......@@ -51,7 +54,7 @@ pub fn get_children_as_vec(node : &Node) -> Vec<Node>{
pub fn get_xref_node(node : &Node, document : &Document) -> Option<Node>{
let opt_xref = node.get_property("xref");
if opt_xref.is_none(){
println!("Node has not xref attribute in get_xref_content");
// println!("Node has not xref attribute in get_xref_content");
return None;
}
......@@ -68,10 +71,10 @@ pub fn get_xref_node(node : &Node, document : &Document) -> Option<Node>{
}
let nodes_vec = eval_res.unwrap().get_nodes_as_vec();
if nodes_vec.len() > 1 {
println!("more than 1 node with id {} in get_xref_content", xref);
// println!("more than 1 node with id {} in get_xref_content", xref);
return None;
}else if nodes_vec.len() == 0{
println!("no node with id {} in get_xref_content", xref);
// println!("no node with id {} in get_xref_content", xref);
return None;
}else{
return Some(nodes_vec[0].clone());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment