Commit 128781a8 authored by ulrich

adaptions in the source files

parent 0b2640cb
......@@ -2,11 +2,12 @@ import sys
import os
import copy
from bs4 import BeautifulSoup, Tag, NavigableString
from astropy import units as astropy_units
from concurrent.futures import ThreadPoolExecutor
from astropy import units as astropy_units
translation = {"M_sun" : "solar masses", "AA" : "Angstrom"}
executor = ThreadPoolExecutor(max_workers=3)
#executor = ThreadPoolExecutor(max_workers=2)
def read_annotations(rdf_soup):
annotations = []
......@@ -639,26 +640,28 @@ def process(html_soup, rdf_soup, output_path):
file.write(html)
def process_file(path_file, html_path, rdf_path, output_path):
    """Process one HTML document together with its ``.kat.xml`` annotation file.

    The path of ``path_file`` relative to ``html_path`` is reused to locate the
    annotation file below ``rdf_path`` (with ``.html`` replaced by ``.kat.xml``)
    and to build the output file path below ``output_path``. Both inputs are
    parsed with BeautifulSoup and handed to ``process`` along with the output
    file path.

    Parameters:
        path_file:   path of the HTML document; expected to start with ``html_path``.
        html_path:   root folder of the HTML documents.
        rdf_path:    root folder of the annotation files.
        output_path: root folder for the generated output.

    Errors raised by ``process`` are reported on stdout and not re-raised.
    Errors while creating the output folder, opening or parsing the inputs
    propagate to the caller.
    """
    #print("path " + path_file)
    _relative_html = path_file[len(html_path):]
    _relative_rdf = _relative_html.replace(".html", ".kat.xml")
    _path_rdf = rdf_path + "/" + _relative_rdf
    _output_file = output_path + "/" + _relative_html
    # Keep everything up to and including the last "/" of the output file path.
    _output_folder = _output_file[:_output_file.rindex("/") + 1]
    #print("output " + _output_folder)
    # exist_ok avoids the check-then-create race when several worker threads
    # need the same output folder at the same time.
    os.makedirs(_output_folder, exist_ok=True)
    # The context manager closes both files on every exit path, including a
    # failure to open the second file or to parse either document.
    with open(path_file, encoding="utf-8") as _html_file, \
            open(_path_rdf, encoding="utf-8") as _rdf_file:
        # The files are opened in text mode, so BeautifulSoup receives already
        # decoded text; from_encoding would be ignored and is not passed.
        _html_soup = BeautifulSoup(_html_file, 'html.parser')
        _rdf_soup = BeautifulSoup(_rdf_file, 'html.parser')
        try:
            process(_html_soup, _rdf_soup, _output_file)
            print("success: " + path_file)
        except Exception:
            # Best effort per document: report and continue with the next one.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            print("error: " + path_file)
            print("Error: " + str(sys.exc_info()[0]))
if len(sys.argv) != 4:
......@@ -697,8 +700,13 @@ if os.path.isdir(html_path) and os.path.isdir(rdf_path):
html_docs.append(comp)
# process_file(html_docs[0], html_path, rdf_path, output_path)
for doc in html_docs:
try:
process_file(doc, html_path, rdf_path, output_path)
except:
print("error at" + doc)
executor.map(lambda path: process_file(path, html_path, rdf_path, output_path), html_docs)
# executor.map(lambda path: process_file(path, html_path, rdf_path, output_path), html_docs)
......
......@@ -34,13 +34,16 @@ pub fn main() {
let input_path = args[2].to_owned();
let document_path = args[1].to_owned();
// let pool = ThreadPool::new(CORES);
let pool = ThreadPool::new(CORES);
let (tx, rx) = channel();
let mut mut_corpus = Corpus::new(document_path.clone());
let mut documents_vec = Vec::new();
let mut desc_counter = 0;
let mut global_m_counter = 0;
for my_document in mut_corpus.iter(){
let my_tx = tx.clone();
......@@ -50,15 +53,23 @@ pub fn main() {
let my_annotation_document = my_document.path.replace(&document_path, &input_path).replace(".html",".kat.xml");
documents_vec.push(my_document.path.clone());
println!("document {}", my_document_path.clone());
// pool.execute(move || {
read_document(my_document_path.clone(), &my_annotation_document, &my_config);
my_tx.send(my_document_path.clone()).unwrap();
// });
pool.execute(move || {
let (d,m) = read_document(my_document_path.clone(), &my_annotation_document, &my_config);
// desc_counter = desc_counter + d;
// global_m_counter = global_m_counter + m;
let desc_str = format!("d {}", d);
let glob_str = format!("g {}", m);
my_tx.send(desc_str).unwrap();
my_tx.send(glob_str).unwrap();
});
}
let number_of_documents = documents_vec.len();
......@@ -66,17 +77,31 @@ pub fn main() {
let timeout = Duration::from_secs(200);
for i in 0..number_of_documents{
for i in 0..(2*number_of_documents){
let res = rx.recv_timeout(timeout);
if res.is_ok(){
let succ_doc = res.unwrap();
successful_documents.push(succ_doc.clone());
println!("finished ({}/{}) {}", (i+1), number_of_documents, succ_doc);
if succ_doc.starts_with("d"){
println!("{}", succ_doc);
let d = format!("{}", &succ_doc[2..]);
println!("d {}",d);
let di = d.parse::<i32>().unwrap();
println!("di {}", di);
desc_counter = desc_counter + di;
}else if succ_doc.starts_with("g"){
let d = format!("{}", &succ_doc[2..]);
let di = d.parse::<i32>().unwrap();
println!("gi {}", di);
global_m_counter = global_m_counter + di;
}
// successful_documents.push(succ_doc.clone());
// println!("finished ({}/{}) {}", (i+1), number_of_documents, succ_doc);
}else{
println!("timeout");
// println!("timeout");
}
}
println!("number of annotations {}, number of meanings {}", desc_counter, global_m_counter);
let mut doc_panic = vec![];
for document in documents_vec{
......@@ -86,16 +111,16 @@ pub fn main() {
}
for document in doc_panic{
println!("Panic at {}", document);
// println!("Panic at {}", document);
}
}
pub fn read_document(document_path : String, annotation_location : &str, config : &Config){
pub fn read_document(document_path : String, annotation_location : &str, config : &Config) -> (i32, i32){
let parser = Parser::default();
let opt_doc = parser.parse_file(annotation_location);
if opt_doc.is_err(){
println!("Cannot parse file {}", annotation_location);
return;
return (0, 0);
}
let annotation_document = opt_doc.unwrap();
......@@ -105,11 +130,13 @@ pub fn read_document(document_path : String, annotation_location : &str, config
let mut root = annotation_document.get_root_element();
find_content(&mut root, &annotation_document, &document, config);
return find_content(&mut root, &annotation_document, &document, config);
}
fn find_content(node : &mut Node, annotation_document : &Document, original_document : &lDoc, config : &Config) {
let mut cnml = false;
fn find_content(node : &mut Node, annotation_document : &Document, original_document : &lDoc, config : &Config) -> (i32, i32) {
let mut cnml = false;
let mut desc_counter = 0;
let mut global_m_counter = 0;
match node.get_type().unwrap() {
NodeType::ElementNode => {
if node.get_name().eq("Description"){
......@@ -135,7 +162,11 @@ fn find_content(node : &mut Node, annotation_document : &Document, original_docu
c = child.get_next_sibling();
}
println!("Meanings {}", counter);
if counter > 0{
println!("Meanings {}", counter);
desc_counter = desc_counter + 1;
global_m_counter = global_m_counter + counter;
}
}
}
_ => { }
......@@ -144,15 +175,15 @@ fn find_content(node : &mut Node, annotation_document : &Document, original_docu
if !cnml {
let mut c: Option<Node> = node.get_first_child();
while let Some(mut child) = c {
find_content(&mut child, annotation_document, original_document, config);
let (d,m) = find_content(&mut child, annotation_document, original_document, config);
desc_counter = desc_counter + d;
global_m_counter = global_m_counter + m;
c = child.get_next_sibling();
}
}
return (desc_counter, global_m_counter);
}
......
......@@ -19,7 +19,7 @@ use libxml::tree::Document as DOM;
static KAT_QE : &'static str = "KAT_1_QuantityExpression";
pub static CORES : usize = 9;
pub static CORES : usize = 20;
/* Returns the content of the "annotation" child of a math node, which contains the tex-code */
pub fn get_tex_from_math(math_node : Node) -> String{
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment