/*******************************************************
** Name:    wet.h
** Author:  Leo Liberti
** Source:  GNU C++
** Purpose: www exploring topologizer - explore a
**          neighbourhood of a given URL and store info,
**          perform queries (header file)
** History: 060820 work started
*******************************************************/

#ifndef _WETWETH
#define _WETWETH

#include<string>
#include "timestamp.h"
#include "fileparser.h"
#include "htmlpage.h"
#include "url.h"
#include "vertex.h"
#include "vertexurl.h"
#include "digraph.h"

namespace WET {

  // print the command line syntax and the list of options on stderr
  void help(void);

  // compare two strings, case insensitive
  // returns -1 if s1 sorts before s2, 1 if it sorts after, 0 if they are
  // equal; when one string is a prefix of the other, the shorter one
  // sorts first
  int compareCaseInsensitive(const std::string& s1, const std::string& s2);

  // is the head of the string equal to the substring? case insensitive
  // returns true when s1 is at least as long as s2 and its first
  // s2.size() characters match s2
  bool isHeadCaseInsensitive(const std::string& s1, const std::string& s2);

  // check whether an (absolute) URL name is local
  // (i.e. whether it starts with one of the local host prefixes defined
  // in the implementation file)
  bool isLocalURL(const std::string& theURL);

  // check whether URL name promises to be a text file from extension
  // the extension is looked for in the last few characters of the
  // relative part of the URL; an empty extension or an html/htm one
  // (case insensitive) is accepted
  bool isTextOrDirURL(const std::string& theURL);

  // get relative URL given the full one
  // theURL is returned unchanged if it does not start with protocolTag;
  // an empty string is returned if no slash follows the host part
  std::string getRelativeURL(const std::string& theURL);

  // retrieve the data, driver
  // stamps G with the current time and starts the recursive exploration
  // from URL (which must be absolute) at depth 0 with no parent vertex
  void retrieveData(std::string URL, int maxDepth, Digraph& G,
                    bool localOnly, bool globalOnly, bool verbose);

  // retrieve the data, recursive
  //   URL          absolute URL of the page being considered
  //   maxDepth     pages are only downloaded while currentDepth < maxDepth
  //   G            graph receiving the vertices (pages) and arcs (links)
  //   localOnly    only download URLs for which isLocalURL() holds
  //   globalOnly   skip URLs containing the host name of a vertex already
  //                stored in G
  //   verbose      report progress on stderr
  //   currentDepth recursion level of this call
  //   vParent      vertex of the page linking to URL (NULL for the root)
  void retrieveData(std::string URL, int maxDepth, Digraph& G,
                    bool localOnly, bool globalOnly, bool verbose,
                    int currentDepth, VertexURL* vParent);

  // protocol prefix expected at the head of absolute URLs
  // (only declared here; the definition lives in another source file)
  extern const std::string protocolTag;
};

#endif
/******************************************************* ** Name: wet.cxx ** Author: Leo Liberti ** Source: GNU C++ ** Purpose: www exploring topologizer - explore a ** neighbourhood of a given URL and display graph ** Test with ** ./wet http://www.enseignement.polytechnique.fr/profs/ \ informatique/Leo.Liberti/test.html 3 ** History: 060820 work started *******************************************************/ #include<iostream> #include<string> #include<cstring> #include "wet.h" namespace WET { enum exitCodes { exitNormal, exitError }; const char charDot = '.'; const char charSlash = '/'; const std::string localPrefix1 = "http://127.0.0.1"; const std::string localPrefix2 = "http://localhost"; const std::string htmlExtension1 = "html"; const std::string htmlExtension2 = "htm"; const int lastFewCharsSize = 5; }; void WET::help(void) { using namespace std; cerr << "wet (WWW Exploring Topologizer), Leo Liberti 2006" << endl; cerr << "Syntax: wet [options] URL maxDepth" << endl; cerr << " Retrieves neighbourhood of radius maxDepth starting with URL" << endl; cerr << " -h display this help" << endl; cerr << " -v verbose progress output on stderr" << endl; cerr << " -l limits exploration to local downloads" << endl; cerr << " -g limits exploration to global downloads" << endl; } int WET::compareCaseInsensitive(const std::string& s1, const std::string& s2) { using namespace std; string::const_iterator p1 = s1.begin(); string::const_iterator p2 = s2.begin(); while(p1 != s1.end() && p2 != s2.end()) { if (toupper(*p1) < toupper(*p2)) { return -1; } else if (toupper(*p1) > toupper(*p2)) { return 1; } p1++; p2++; } if (s1.size() < s2.size()) { return -1; } else if (s1.size() > s2.size()) { return 1; } return 0; } bool WET::isHeadCaseInsensitive(const std::string& s1, const std::string& s2) { using namespace std; int s2len = s2.size(); if (s1.size() >= s2.size() && WET::compareCaseInsensitive(s1.substr(0, s2len), s2) == 0) { return true; } return false; } bool 
WET::isLocalURL(const std::string& theURL) { using namespace std; using namespace WET; return isHeadCaseInsensitive(theURL, localPrefix1) || isHeadCaseInsensitive(theURL, localPrefix2); } std::string WET::getRelativeURL(const std::string& theURL) { using namespace std; string ret; if (!isHeadCaseInsensitive(theURL, protocolTag)) { ret = theURL; return ret; } string tmp = theURL.substr(protocolTag.size() + 1); int lastSlashPos = tmp.find(charSlash); if (lastSlashPos == tmp.npos) { return ret; } ret = tmp.substr(tmp.find(charSlash) + 1); return ret; } bool WET::isTextOrDirURL(const std::string& theURL) { using namespace std; // jump over three slashes string relativeURL = getRelativeURL(theURL); // get the last few characters string lastFewChars; if (relativeURL.size() > lastFewCharsSize) { lastFewChars = relativeURL.substr(relativeURL.size() - lastFewCharsSize); } else { lastFewChars = relativeURL; } // get the extension string extension = lastFewChars.substr(lastFewChars.rfind(charDot)+1); if (extension.size() == 0) { return true; } if (compareCaseInsensitive(extension, htmlExtension1) == 0 || compareCaseInsensitive(extension, htmlExtension2) == 0) { return true; } return false; } void WET::retrieveData(std::string theURL, int maxDepth, Digraph& G, bool localOnly, bool globalOnly, bool verbose) { TimeStamp ts; ts.update(); G.setTimeStamp(ts); // notice that theURL must be an absolute URL WET::retrieveData(theURL, maxDepth, G, localOnly, globalOnly, verbose, 0, NULL); } void WET::retrieveData(std::string URLName, int maxDepth, Digraph& G, bool localOnly, bool globalOnly, bool verbose, int currentDepth, VertexURL* vParent) { using namespace std; using namespace WET; bool proceed = true; if (verbose) { cerr << "wet: " << URLName; } // checks for proceeding to store url as vertex if (currentDepth >= maxDepth) { // check that we are not exceeding recursion level proceed = false; } if (localOnly && !isLocalURL(URLName)) { // if local flag is set, check that URL is local 
proceed = false; } if (globalOnly) { // if global flag is set, check that URL is global: scan all vertices to // see if one already contains the same hostname - if yes, ignore this int vSize = G.getNumberOfVertices(); for(int i = 0; i < vSize; i++) { Vertex& theVertex = G.getVertex(i); VertexURL* theVtxURL = dynamic_cast<VertexURL*>(&theVertex); URL& theURL = theVtxURL->getURL(); string hostName = theURL.getHostName(); if (URLName.find(hostName) != URLName.npos) { // found proceed = false; break; } } } if (!isTextOrDirURL(URLName)) { proceed = false; } // check that this new URLName was not already downloaded at some point int vSize = G.getNumberOfVertices(); for(int i = 0; i < vSize; i++) { Vertex& theVertex = G.getVertex(i); VertexURL* theVtxURL = dynamic_cast<VertexURL*>(&theVertex); URL& theURL = theVtxURL->getURL(); string secondURL = theURL.getURLName(); if (URLName == secondURL) { // vertex already exists, just add an arc vParent -> theVertex int theID = theVertex.getID(); if (vParent) { vParent->addAdjacentVertexID(theID); } Arc* arcPtr = new Arc(vParent->getID(), theID); G.addArc(*arcPtr); proceed = false; break; } } // do it if (proceed) { if (verbose) { cerr << ": downloading" << endl; } URL* myURLPtr = new URL(URLName); myURLPtr->download(); // get next available vertex ID int theID = G.getNumberOfVertices(); VertexURL* vtxPtr = new VertexURL(theID, myURLPtr); // vertex pointers are only stored in the graph // (where VertexURL objects are stored as virtual base Vertex objects) G.addVertex(*vtxPtr); if (vParent) { // add the vertex ID to the adjacency list of the parent vertex vParent->addAdjacentVertexID(theID); // arc pointers are only stored in the graph Arc* arcPtr = new Arc(vParent->getID(), theID); G.addArc(*arcPtr); } // get next link string theLink = myURLPtr->getNextLink(); while(theLink.size() > 0) { // if there is a next link, recurse retrieveData(theLink, maxDepth, G, localOnly, globalOnly, verbose, currentDepth+1, vtxPtr); theLink = 
myURLPtr->getNextLink(); } } else if (verbose) { cerr << endl; } } int main(int argc, char** argv) { using namespace std; using namespace WET; int ret = 0; bool localOnly = false; bool globalOnly = false; bool verbose = false; // main input data fields string URLName; int maxDepth; string timeStampName; // read options if (argc < 2 || strncmp(argv[1], "-h", 2) == 0) { help(); exit(exitNormal); } int theArgCounter = 1; while(argv[theArgCounter][0] == '-') { // read options if (strncmp(argv[theArgCounter], "-l", 2) == 0) { localOnly = true; } else if (strncmp(argv[theArgCounter], "-g", 2) == 0) { globalOnly = true; } else if (strncmp(argv[theArgCounter], "-v", 2) == 0) { verbose = true; } theArgCounter++; if (theArgCounter == argc) { cerr << "wet: error: after options, cmd line requires URL and maxDepth" << endl; exit(exitError); } } URLName = argv[theArgCounter]; if (URLName.find(WET::protocolTag) == URLName.npos) { URLName = WET::protocolTag + URLName; } theArgCounter++; if (theArgCounter == argc) { cerr << "wet: error: after options and URL, cmd line requires maxDepth" << endl; exit(exitError); } maxDepth = atoi(argv[theArgCounter]); // validate input data if (maxDepth < 1) { cerr << "wet: error: maxDepth must be strictly greater than 0" << endl; exit(exitError); } // do it Digraph G; try { retrieveData(URLName, maxDepth, G, localOnly, globalOnly, verbose); cout << G; } catch (TimeStampException) { cerr << "wet: exception in TimeStamp object, aborting" << endl; } catch (FileParserException) { cerr << "wet: exception in FileParser object, aborting" << endl; } catch (HTMLPageException) { cerr << "wet: exception in HTMLPage object, aborting" << endl; } catch (URLException) { cerr << "wet: exception in URL object, aborting" << endl; } catch (VertexURLException) { cerr << "wet: exception in VertexURL object, aborting" << endl; } catch (DigraphException) { cerr << "wet: exception in Digraph object, aborting" << endl; } catch (...) 
{ cerr << "wet: caught generic exception, aborting" << endl; } return ret; }