Commit b929dc9b authored by Michael Kohlhase's avatar Michael Kohlhase
Browse files

moved here from the svn

parents
/*
---------------------DOMExport-----------------------
Part 1 of software for Connexions definitions mapping:
A program which scans through a file and when a definition or
a term is found it records in the upper directory in the file
Definitions a row with:
1) name of the definition
2) name of folder
3) id of the definition in that document
In file Dterms the program adds a row with:
1) name of the term
2) link to the file where the term is found
The original DOCTYPE lines (with the links to the DTDs) are saved in the file Daddresses.
After this program has been run, all the definitions from the
documents have been extracted into Definitions, and all terms which
are not linked to definitions have been recorded in the file
Dterms. The original DTD addresses are stored in Daddresses.
The Definitions and Dterms files will be used for phase 2.
Author: Vladimir Kirilov
Supervisor: Prof. Michael Kohlhase
Date: 14 May 2004
The file has to be compiled and linked with Xerces C++ APIs.
*/
// Includes for the Apache DOM Parser
#include <xercesc/dom/DOM.hpp>
#include <xercesc/parsers/XercesDOMParser.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/framework/LocalFileInputSource.hpp>
#include <xercesc/dom/DOMNodeIterator.hpp>
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/parsers/AbstractDOMParser.hpp>
#include <xercesc/dom/DOMImplementation.hpp>
#include <xercesc/dom/DOMImplementationLS.hpp>
#include <xercesc/dom/DOMImplementationRegistry.hpp>
#include <xercesc/dom/DOMBuilder.hpp>
#include <xercesc/dom/DOMException.hpp>
#include <xercesc/dom/DOMDocument.hpp>
#include <xercesc/dom/DOMNodeList.hpp>
#include <xercesc/dom/DOMError.hpp>
#include <xercesc/dom/DOMLocator.hpp>
#include <xercesc/dom/DOMNamedNodeMap.hpp>
#include <xercesc/dom/DOMAttr.hpp>
XERCES_CPP_NAMESPACE_USE
// Standard library includes
#include <iostream>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <string>
#include <dirent.h>
#include <sys/types.h>
#include <ctype.h> // for using tolower()
using namespace std;
/*
Function exporting the DTD line of the file to an external file and
inserting the path to the local DTDs.

Parameter:
  folder - path of the document to rewrite in place.

The function
  1) copies the first tag of the file (everything up to the first '>'),
  2) locates the <!DOCTYPE ...> tag, dropping any other tags before it,
  3) extracts the DTD file name that follows "DTD/" in that tag,
  4) appends "<folder>\t<original DOCTYPE tag>" to the file Daddresses,
  5) writes the first tag, a DOCTYPE pointing at "../libs/<dtd>" and the
     untouched remainder of the document to the file Destination, and
  6) renames Destination over the original document.

Returns 0 when the document was rewritten, and -1 when it was left
untouched: the file could not be opened, it has no DOCTYPE tag, the
DOCTYPE holds no "DTD/" link (which is the case for a document that this
function has already rewritten), or one of the output files failed.
*/
int correctlib(std::string& folder){
    FILE* src = fopen (folder.c_str(), "r"); // open the source for reading
    if (src == NULL) {
        std::cout << "################################PROBLEM opening file#################################" << std::endl;
        return -1; // nothing can be read from a missing file
    }

    // Get the first tag... <! ... >
    // fgetc() returns int so that EOF can be told apart from real characters.
    std::string firstTag;
    int c;
    while ((c = fgetc(src)) != EOF) {
        firstTag += static_cast<char>(c);
        if (c == '>') break;
    }
    if (c == EOF) { // the file ended before any tag was closed
        fclose (src);
        return -1;
    }
    firstTag += '\n';

    // Get the <!DOCTYPE ... > tag. Newlines are not stored, and every tag
    // that is closed before the DOCTYPE (e.g. a comment) is thrown away.
    std::string doctype;
    bool haveDoctype = false;
    while ((c = fgetc(src)) != EOF) {
        if (c == '\n') continue;
        doctype += static_cast<char>(c);
        if (c != '>') continue;
        if (doctype.find("<!DOCTYPE") != std::string::npos) {
            haveDoctype = true;
            break;
        }
        doctype.clear(); // end of a tag which is not the one we need
    }
    if (!haveDoctype) {
        fclose (src);
        return -1;
    }
    doctype += '\n';

    // Extract the DTD name: the text from the '/' of "DTD/" up to the closing
    // quote. The search starts at index 1, as the original scan did. A DOCTYPE
    // without "DTD/" means the link has already been updated.
    const std::string::size_type marker = doctype.find("DTD/", 1);
    if (marker == std::string::npos) {
        fclose (src);
        return -1; // then this file has probably been parsed already
    }
    const std::string::size_type slash = marker + 3;
    const std::string::size_type quote = doctype.find_first_of("\"'", slash);
    if (quote == std::string::npos) { // malformed DOCTYPE: unterminated link
        fclose (src);
        return -1;
    }
    const std::string dtd = doctype.substr(slash, quote - slash);

    // Store the original line in another file before modifying the document.
    FILE* Daddress = fopen ("Daddresses", "a+");
    if (Daddress == NULL) {
        std::cout << "################################PROBLEM (with Daddress file)#################################" << std::endl;
        fclose (src);
        return -1; // do not rewrite a document whose original link cannot be saved
    }
    const std::string record = folder + "\t" + doctype;
    fputs (record.c_str(), Daddress);
    fclose (Daddress);

    const std::string path = "<!DOCTYPE document PUBLIC \"-//CNX//DTD CNXML 0.5 plus MathML//EN\" \"../libs" + dtd + "\">";

    // Destination is a scratch file: truncate it so that leftovers from an
    // interrupted run never end up in the rewritten document.
    FILE* dest = fopen ("Destination", "w");
    if (dest == NULL) {
        std::cout << "################################PROBLEM (with Destination file)#################################" << std::endl;
        fclose (src);
        return -1;
    }

    bool ok = (fputs (firstTag.c_str(), dest) != EOF) && (fputs (path.c_str(), dest) != EOF);

    // Copy the rest of the document byte for byte.
    char buffer[4096];
    size_t got = 0;
    while (ok && (got = fread (buffer, 1, sizeof buffer, src)) > 0) {
        ok = (fwrite (buffer, 1, got, dest) == got);
    }
    if (ferror (src)) ok = false;
    fclose (src);
    if (fclose (dest) != 0) ok = false;

    // Replace the original document with the newly created file. rename()
    // is used instead of a shell "mv" so that unusual characters in the path
    // cannot be interpreted by a shell.
    if (!ok || rename ("Destination", folder.c_str()) != 0) {
        std::cout << "################################PROBLEM (with Destination file)#################################" << std::endl;
        remove ("Destination");
        return -1;
    }
    return 0;
}
// Function which counts the number of directories to be worked on.
// It counts the entries of "../cnxml05" whose name does not start with
// '.', 'D' or 'l' (the same filter main() applies before working on an entry).
// Returns 0 when the directory cannot be opened.
int countdirs (){
    DIR* dir = opendir ("../cnxml05");
    if (dir == NULL) {
        // readdir() on a NULL stream would crash, so report and count nothing
        std::cerr << "Failed to open directory ../cnxml05" << std::endl;
        return 0;
    }
    int folders = 0;
    struct dirent* entry;
    while ((entry = readdir(dir)) != NULL){
        const char first = entry->d_name[0];
        if ((first == '.') || (first == 'D') || (first == 'l')) continue;
        folders++;
    }
    closedir (dir);
    return folders;
}
// DOMExport
int main (int argc, char** argv){
// Time variables
time_t start, end;
time (&start);
// Extracted data from a file
string defid = ""; // definition id (will be used as #id to point to the place)
string path = ""; // path to the file
string term = ""; // the name of the term
FILE* definitions; // pointer to Definitions file
FILE* terms; // pointer to Dterms file
// Counting the number of directories in the folder:
int dirs = 0; // number of folders
dirs = countdirs();
int count = 0; // counter for the number of files passed;
// Reading the directories in the file:
DIR *dirspointer;
struct dirent *dirp;
string currentdir; // holds the name of the current directory
int def = 0; // flag for occured definition
int termscounter = 0; // counts elements with source
int termscounter2 = 0; // counts elements without source attribute
dirspointer = opendir("../cnxml05"); // hardcoding the assumed folder the program is run in - cnxml05
while ((dirp = readdir(dirspointer)) != NULL){
currentdir = dirp->d_name; // passes the name of the directory in the char array
currentdir+= "/index.cnxml"; // mapping to the file to be worked on
path = currentdir; // save the path of the file with the definition;
if ((currentdir[0] == '.') || (currentdir[0] == 'l') || (currentdir[0] == 'D')) {
continue; // avoid non cnxml folders
}
count++;
time (&end);
double runtime;
runtime = difftime (end, start);
cout << "The program finished working on " << count << " files in: " << runtime << " seconds." << endl;
cout << "Current path to file: " << currentdir << endl;
int status = 0;
// calling the function to modify the parsed file, if the file has already been modified continue with the next file
if ((status = correctlib (currentdir)) == -1) continue;
// initialize the parser and load an XML document as a DOM tree
// use DOMParser object does all the work
// create a pointer to the parsed document
XERCES_CPP_NAMESPACE_QUALIFIER DOMDocument *doc = 0;
bool bFailed = false;
// initialize XML library
XMLPlatformUtils::Initialize();
//create a new parser instance
XercesDOMParser *parser = new XercesDOMParser;
// if such an instance exists
if (parser){
parser->setValidationScheme(XercesDOMParser::Val_Auto);
parser->setDoNamespaces(false);
parser->setDoSchema(false);
parser->setCreateEntityReferenceNodes(false);
try {
XMLCh *src;
src = XMLString::transcode (currentdir.c_str()); // takes name of the source
LocalFileInputSource source (src);
parser -> parse (src); // create a pointer to the parsed document
doc = parser->getDocument(); // update the pointer to the document
if (bFailed){
std::cerr << "Parsing " << currentdir;
std::cerr << " error count: " << parser->getErrorCount() << std::endl;
}
}
catch (const DOMException& e){
std::cerr << "DOM Exception parsing ";
std::cerr << " reports: ";
if (e.msg){
char *strMsg = XMLString::transcode(e.msg);
std::cerr << strMsg << std::endl;
XMLString::release(&strMsg);
}
else
std::cerr << e.code << std::endl;
bFailed = true;
}
catch (const XMLException& e){
std::cerr << "XML Exception parsing ";
std::cerr << currentdir;
std::cerr << " reports: ";
std::cerr << e.getMessage() << std::endl;
bFailed = true;
}
// If the input document is parsed then enter here.
if (!bFailed){
DOMNode *root;
DOMNode *current = NULL;
root = (DOMElement*)doc->getDocumentElement();
DOMNodeIterator* iterator = doc->createNodeIterator (root, DOMNodeFilter::SHOW_ALL, NULL, true);
for (current = iterator->nextNode(); current != 0; current = iterator->nextNode()){
char* nodename = XMLString::transcode (current->getNodeName());
string tmp;
tmp = nodename;
// if it is a term outside a definition
if ((tmp == "term") && (def == 0)) {
// ------------------------- TERM without DEFINITION
string tempo; // it will hold the name of the attribute
DOMNamedNodeMap *smth = current->getAttributes();
int nattr = smth->getLength();
for(int i=0;i<nattr;++i) {
DOMAttr *pAttributeNode = (DOMAttr*) smth->item(i);
// get attribute name
if (i == 0){
char *name = XMLString::transcode(pAttributeNode->getName());
tempo = name;
XMLString::release(&name);
}
}
if (!strcmp (tempo.c_str(), "src")) {
termscounter++;
// if the term has attribute (src)
current = iterator->nextNode(); // go to the next element node;
}
else {
termscounter2++;
def = 3; // if it doesn't have attributes set def to 3
}
}
XMLString::release(&nodename);
nodename = XMLString::transcode (current->getNodeName());
tmp = nodename;
// if the current node is a term in a definition
if ((tmp == "term") && (def == 1)){
def = 2;
}
//checking if the current node is definition
if (tmp == "definition"){
def = 1;
}
// traverses the attributes of DEFINITION and extracts the id
if ((current->hasAttributes()) && (def == 1)){
DOMNamedNodeMap *pAttributes = current->getAttributes();
int nSize = pAttributes->getLength();
for(int i=0;i<nSize;++i) {
DOMAttr *pAttributeNode = (DOMAttr*) pAttributes->item(i);
// get attribute name
if (i == 0){
char *name = XMLString::transcode(pAttributeNode->getName());
XMLString::release(&name);
// get attribute type
name = XMLString::transcode(pAttributeNode->getValue());
defid = name; // save the name of the definition id
XMLString::release(&name);
}
}
}
// if the current node is a text node of a term parent which is not after a definition
if((current->getNodeType()==DOMNode::TEXT_NODE) && (def == 3)) { // if the term is without a src attribute
DOMNode* attr= current;
char* printtext;
printtext=XMLString::transcode(attr->getNodeValue());
term = printtext; // save the term name of the definition
def = 0; // set def to default
/* Store everything in a file after having all data. */
terms = fopen ("Dterms", "a+");
if (terms == NULL) cout << "###################PROBLEM (with Dterms file)###############" << endl;
string tmp = "";
int k = 0;
char ch;
// converting the term name to lowercase
while (term[k] != '\0'){
ch = tolower (term[k]);
tmp = tmp + ch;
k++;
}
tmp = tmp + "\t" + "../" + path + "\n";
fputs (tmp.c_str(), terms);
fclose (terms);
tmp = "";
term = "";
defid = "";
XMLString::release(&printtext);
}
// if the node is the text after a term in a definition
if((current->getNodeType()==DOMNode::TEXT_NODE) && (def == 2)) {
DOMNode* attr= current;
char* printtext;
printtext=XMLString::transcode(attr->getNodeValue());
term = printtext; // save the term name of the definition
def = 0;
/* Store everything in a file after having all data. */
definitions = fopen ("Definitions", "a+");
if (definitions == NULL) cout << "###################PROBLEM (with Definitions file)###############" << endl;
string tmp = "";
int k = 0;
char ch;
// converting the term name to lowercase
while (term[k] != '\0'){
ch = tolower (term[k]);
tmp = tmp + ch;
k++;
}
tmp = tmp + "\t" + "../" + path + "#" + defid + "\n";
fputs (tmp.c_str(), definitions);
fclose (definitions);
tmp = "";
term = "";
defid = "";
XMLString::release(&printtext);
}
XMLString::release(&nodename);
}
}
}
delete parser;// frees memory of the parser
XMLPlatformUtils::Terminate();
} // closing the while loop
closedir (dirspointer); // closing the directory reading
cout << "FINISHED!!! Please press enter and proceed to the next step!" << endl;
time (&end);
double runtime;
runtime = difftime (end, start);
cout << "The program finished working on " << dirs << " files in: " << runtime << " seconds." << endl;
cout << "Linked terms: " << termscounter << endl;
cout << "Uninked terms: " << termscounter2 << endl;
return 0;
}
/*
----------------------------Dchecker------------------------------
Program which manipulates Definitions and Dterms.
1) Orders Definitions alphabetically and eliminates the redundant definitions.
2) Scans through the terms file and checks whether each term has already been defined
in the Definitions file. If so, records the file to be updated in another file
called Dupdate.
Author: Vladimir Kirilov
Date: 12 May 2004
*/
#include <iostream>
#include <string>
#include <vector>
using namespace std;
int main (){
FILE* definitions;
FILE* terms;
FILE* updates;
vector <string> defs1; // it will hold the names of the definitions
vector <string> defs2; // it will hold the paths to the definitions
vector <string> terms1; // it will hold the names of the terms
vector <string> terms2; // it will hold the paths to the terms
string temp;
int i = 0;
char* line = NULL;
size_t len = 0;
int read = 0;
int mark = 0;
/* ************************************************************** */
// Read the Definitions log file
definitions = fopen ("Definitions", "r");
if (definitions == NULL) cout << "Failed to open file Definitions." << endl;
//put the definitions into memory
do {
read = getline (&line, &len, definitions); // read the first line from the source
if ( read == -1) {
break;
}
temp = line;
i = temp.find ('\t');
temp = temp.substr (0, i); // temp will hold the name of the term
for (i = 0; (unsigned)i < defs1.size(); i++){
string tmp;
tmp = defs1[i];
if ( !strcmp (tmp.c_str(), temp.c_str())) break; // if the two strings are the same
}
mark = 0; // default for non-stored
// if no such term was found in the list
if ((unsigned)i == defs1.size()){
defs1.push_back(temp); // store the word into the vector
mark = 1; // if we store a path in the first vector
}
temp = line;
int l = temp.find ('\t');
int k = temp.find ('\n');
temp = temp.substr (l+1, k-l-1); // temp will hold the name of the path
if ((i = temp.find('\n')) != -1){
temp[i] = '\0';
}
if (mark == 1)
defs2.push_back(temp); // store the word into the vector
temp = "";
} while (read != -1);
fclose (definitions);
// ********************************************************** //
// Read the terms log file.
terms = fopen ("Dterms", "r");
if (terms == NULL) cout << "Failed to open file Dterms." << endl;
do {
read = getline (&line, &len, terms); // read the first line from the source
if ( read == -1) {
break;
}
temp = line;
i = temp.find ("\t../");
if (i == -1) {
getline (&line, &len, terms); // read one more line if we cannot find the whole on this line
temp += line; // combine the 2 lines into one
i = temp.find ("\t../");// find the string now.
}
if (i == 0) continue; // if the term name is not entered
string tmp = "";
tmp = temp.substr (0, i); // temp will hold the name of the term
// if there is still a tab in the string substitute it with a single character
if ((i = tmp.find('\n')) != -1){
tmp.replace(i, tmp.length(), tmp, i+1, tmp.length());
}
if ((i = tmp.find('\t')) != -1){
tmp[i] = ' ';
}
terms1.push_back(tmp); // store the term into the vector
int l = temp.find ("\t../");
int k = temp.find ('\n');
temp = temp.substr (l+1, k-l-1); // temp will hold the name of the path
if ((i = temp.find('\n')) != -1){
temp.erase (i, 1);
}
terms2.push_back(temp); // store the path into the vector
temp = "";