/*******************************************************
** Name: wet.h
** Author: Leo Liberti
** Source: GNU C++
** Purpose: www exploring topologizer - explore a
** neighbourhood of a given URL and store info,
** perform queries (header file)
** History: 060820 work started
*******************************************************/
#ifndef _WETWETH
#define _WETWETH
#include<string>
#include "timestamp.h"
#include "fileparser.h"
#include "htmlpage.h"
#include "url.h"
#include "vertex.h"
#include "vertexurl.h"
#include "digraph.h"
namespace WET {
void help(void);
// compare two strings, case insensitive
int compareCaseInsensitive(const std::string& s1, const std::string& s2);
// is the head of the string equal to the substring? case insensitive
bool isHeadCaseInsensitive(const std::string& s1, const std::string& s2);
// check whether an (absolute) URL name is local
bool isLocalURL(const std::string& theURL);
// check whether URL name promises to be a text file from extension
bool isTextOrDirURL(const std::string& theURL);
// get relative URL given the full one
std::string getRelativeURL(const std::string& theURL);
// retrieve the data, driver
void retrieveData(std::string URL, int maxDepth, Digraph& G,
bool localOnly, bool globalOnly, bool verbose);
// retrieve the data, recursive
void retrieveData(std::string URL, int maxDepth, Digraph& G,
bool localOnly, bool globalOnly, bool verbose,
int currentDepth, VertexURL* vParent);
extern const std::string protocolTag;
};
#endif
/*******************************************************
** Name: wet.cxx
** Author: Leo Liberti
** Source: GNU C++
** Purpose: www exploring topologizer - explore a
** neighbourhood of a given URL and display graph
** Test with
** ./wet http://www.enseignement.polytechnique.fr/profs/ \
informatique/Leo.Liberti/test.html 3
** History: 060820 work started
*******************************************************/
#include<iostream>
#include<string>
#include<cstring>
#include "wet.h"
namespace WET {
enum exitCodes { exitNormal, exitError };
const char charDot = '.';
const char charSlash = '/';
const std::string localPrefix1 = "http://127.0.0.1";
const std::string localPrefix2 = "http://localhost";
const std::string htmlExtension1 = "html";
const std::string htmlExtension2 = "htm";
const int lastFewCharsSize = 5;
};
void WET::help(void) {
using namespace std;
cerr << "wet (WWW Exploring Topologizer), Leo Liberti 2006" << endl;
cerr << "Syntax: wet [options] URL maxDepth" << endl;
cerr << " Retrieves neighbourhood of radius maxDepth starting with URL"
<< endl;
cerr << " -h display this help" << endl;
cerr << " -v verbose progress output on stderr" << endl;
cerr << " -l limits exploration to local downloads" << endl;
cerr << " -g limits exploration to global downloads" << endl;
}
int WET::compareCaseInsensitive(const std::string& s1, const std::string& s2) {
using namespace std;
string::const_iterator p1 = s1.begin();
string::const_iterator p2 = s2.begin();
while(p1 != s1.end() && p2 != s2.end()) {
if (toupper(*p1) < toupper(*p2)) {
return -1;
} else if (toupper(*p1) > toupper(*p2)) {
return 1;
}
p1++;
p2++;
}
if (s1.size() < s2.size()) {
return -1;
} else if (s1.size() > s2.size()) {
return 1;
}
return 0;
}
bool WET::isHeadCaseInsensitive(const std::string& s1, const std::string& s2) {
using namespace std;
int s2len = s2.size();
if (s1.size() >= s2.size() &&
WET::compareCaseInsensitive(s1.substr(0, s2len), s2) == 0) {
return true;
}
return false;
}
bool WET::isLocalURL(const std::string& theURL) {
using namespace std;
using namespace WET;
return isHeadCaseInsensitive(theURL, localPrefix1) ||
isHeadCaseInsensitive(theURL, localPrefix2);
}
std::string WET::getRelativeURL(const std::string& theURL) {
using namespace std;
string ret;
if (!isHeadCaseInsensitive(theURL, protocolTag)) {
ret = theURL;
return ret;
}
string tmp = theURL.substr(protocolTag.size() + 1);
int lastSlashPos = tmp.find(charSlash);
if (lastSlashPos == tmp.npos) {
return ret;
}
ret = tmp.substr(tmp.find(charSlash) + 1);
return ret;
}
bool WET::isTextOrDirURL(const std::string& theURL) {
using namespace std;
// jump over three slashes
string relativeURL = getRelativeURL(theURL);
// get the last few characters
string lastFewChars;
if (relativeURL.size() > lastFewCharsSize) {
lastFewChars = relativeURL.substr(relativeURL.size() - lastFewCharsSize);
} else {
lastFewChars = relativeURL;
}
// get the extension
string extension = lastFewChars.substr(lastFewChars.rfind(charDot)+1);
if (extension.size() == 0) {
return true;
}
if (compareCaseInsensitive(extension, htmlExtension1) == 0 ||
compareCaseInsensitive(extension, htmlExtension2) == 0) {
return true;
}
return false;
}
void WET::retrieveData(std::string theURL, int maxDepth, Digraph& G,
bool localOnly, bool globalOnly, bool verbose) {
TimeStamp ts;
ts.update();
G.setTimeStamp(ts);
// notice that theURL must be an absolute URL
WET::retrieveData(theURL, maxDepth, G, localOnly, globalOnly,
verbose, 0, NULL);
}
void WET::retrieveData(std::string URLName, int maxDepth, Digraph& G,
bool localOnly, bool globalOnly, bool verbose,
int currentDepth, VertexURL* vParent) {
using namespace std;
using namespace WET;
bool proceed = true;
if (verbose) {
cerr << "wet: " << URLName;
}
// checks for proceeding to store url as vertex
if (currentDepth >= maxDepth) {
// check that we are not exceeding recursion level
proceed = false;
}
if (localOnly && !isLocalURL(URLName)) {
// if local flag is set, check that URL is local
proceed = false;
}
if (globalOnly) {
// if global flag is set, check that URL is global: scan all vertices to
// see if one already contains the same hostname - if yes, ignore this
int vSize = G.getNumberOfVertices();
for(int i = 0; i < vSize; i++) {
Vertex& theVertex = G.getVertex(i);
VertexURL* theVtxURL = dynamic_cast<VertexURL*>(&theVertex);
URL& theURL = theVtxURL->getURL();
string hostName = theURL.getHostName();
if (URLName.find(hostName) != URLName.npos) {
// found
proceed = false;
break;
}
}
}
if (!isTextOrDirURL(URLName)) {
proceed = false;
}
// check that this new URLName was not already downloaded at some point
int vSize = G.getNumberOfVertices();
for(int i = 0; i < vSize; i++) {
Vertex& theVertex = G.getVertex(i);
VertexURL* theVtxURL = dynamic_cast<VertexURL*>(&theVertex);
URL& theURL = theVtxURL->getURL();
string secondURL = theURL.getURLName();
if (URLName == secondURL) {
// vertex already exists, just add an arc vParent -> theVertex
int theID = theVertex.getID();
if (vParent) {
vParent->addAdjacentVertexID(theID);
}
Arc* arcPtr = new Arc(vParent->getID(), theID);
G.addArc(*arcPtr);
proceed = false;
break;
}
}
// do it
if (proceed) {
if (verbose) {
cerr << ": downloading" << endl;
}
URL* myURLPtr = new URL(URLName);
myURLPtr->download();
// get next available vertex ID
int theID = G.getNumberOfVertices();
VertexURL* vtxPtr = new VertexURL(theID, myURLPtr);
// vertex pointers are only stored in the graph
// (where VertexURL objects are stored as virtual base Vertex objects)
G.addVertex(*vtxPtr);
if (vParent) {
// add the vertex ID to the adjacency list of the parent vertex
vParent->addAdjacentVertexID(theID);
// arc pointers are only stored in the graph
Arc* arcPtr = new Arc(vParent->getID(), theID);
G.addArc(*arcPtr);
}
// get next link
string theLink = myURLPtr->getNextLink();
while(theLink.size() > 0) {
// if there is a next link, recurse
retrieveData(theLink, maxDepth, G, localOnly, globalOnly, verbose,
currentDepth+1, vtxPtr);
theLink = myURLPtr->getNextLink();
}
} else if (verbose) {
cerr << endl;
}
}
int main(int argc, char** argv) {
using namespace std;
using namespace WET;
int ret = 0;
bool localOnly = false;
bool globalOnly = false;
bool verbose = false;
// main input data fields
string URLName;
int maxDepth;
string timeStampName;
// read options
if (argc < 2 || strncmp(argv[1], "-h", 2) == 0) {
help();
exit(exitNormal);
}
int theArgCounter = 1;
while(argv[theArgCounter][0] == '-') {
// read options
if (strncmp(argv[theArgCounter], "-l", 2) == 0) {
localOnly = true;
} else if (strncmp(argv[theArgCounter], "-g", 2) == 0) {
globalOnly = true;
} else if (strncmp(argv[theArgCounter], "-v", 2) == 0) {
verbose = true;
}
theArgCounter++;
if (theArgCounter == argc) {
cerr << "wet: error: after options, cmd line requires URL and maxDepth"
<< endl;
exit(exitError);
}
}
URLName = argv[theArgCounter];
if (URLName.find(WET::protocolTag) == URLName.npos) {
URLName = WET::protocolTag + URLName;
}
theArgCounter++;
if (theArgCounter == argc) {
cerr << "wet: error: after options and URL, cmd line requires maxDepth"
<< endl;
exit(exitError);
}
maxDepth = atoi(argv[theArgCounter]);
// validate input data
if (maxDepth < 1) {
cerr << "wet: error: maxDepth must be strictly greater than 0" << endl;
exit(exitError);
}
// do it
Digraph G;
try {
retrieveData(URLName, maxDepth, G, localOnly, globalOnly, verbose);
cout << G;
} catch (TimeStampException) {
cerr << "wet: exception in TimeStamp object, aborting" << endl;
} catch (FileParserException) {
cerr << "wet: exception in FileParser object, aborting" << endl;
} catch (HTMLPageException) {
cerr << "wet: exception in HTMLPage object, aborting" << endl;
} catch (URLException) {
cerr << "wet: exception in URL object, aborting" << endl;
} catch (VertexURLException) {
cerr << "wet: exception in VertexURL object, aborting" << endl;
} catch (DigraphException) {
cerr << "wet: exception in Digraph object, aborting" << endl;
} catch (...) {
cerr << "wet: caught generic exception, aborting" << endl;
}
return ret;
}