/*************************************************************** ** Name: url.h ** Author: Leo Liberti ** Source: GNU C++ ** Purpose: www exploring topologizer - URL data type (header) ** History: 060820 work started ****************************************************************/ #ifndef _WETURLH #define _WETURLH #include<string> #include "htmlpage.h" class URLException { public: URLException(); ~URLException(); }; class URL { public: URL(); URL(std::string theURL); ~URL(); std::string getURLName(void) const; void setURLName(std::string theURL); std::string getFileName(void) const; // download the URL void download(void) throw(URLException); // return the HTTP links in the URL std::string getNextLink(void) throw(URLException); // return the hostname in the URL std::string getHostName(void); private: HTMLPage* htmlPagePtr; std::string URLName; std::string fileName; int linkCounter; }; #endif
/*************************************************************** ** Name: url.cxx ** Author: Leo Liberti ** Source: GNU C++ ** Purpose: www exploring topologizer - URL data type ** History: 060820 work started ****************************************************************/ #include<iostream> #include<string> #include<cstdlib> #include<algorithm> #include<unistd.h> #include "url.h" #include "wet.h" namespace WET { const char charSlash = '/'; const char charQuote = '\"'; const std::string protocolTag = "http://"; const std::string theCommand = "wget --timeout=10 --quiet -O "; const std::string stdFileName = ".wet.html"; }; URLException::URLException() { } URLException::~URLException() { } URL::URL() : htmlPagePtr(NULL), linkCounter(0) { } URL::URL(std::string theURL) : URLName(theURL), htmlPagePtr(NULL), linkCounter(0) { } URL::~URL() { if (htmlPagePtr) { delete htmlPagePtr; #ifdef DEBUG std::cerr << "** destroying URL " << this << " and htmlPagePtr " << htmlPagePtr << std::endl; } else { std::cerr << "** destroying URL " << this << std::endl; #endif } } std::string URL::getURLName(void) const { return URLName; } void URL::setURLName(std::string theURL) { URLName = theURL; } std::string URL::getFileName(void) const { return fileName; } void URL::download(void) throw(URLException) { using namespace std; using namespace WET; if (URLName == "") { cerr << "URL::download(): URLName not initialized" << endl; throw URLException(); } unlink(stdFileName.c_str()); string cmd = theCommand + stdFileName + " " + charQuote + URLName + charQuote; // spawn the command #ifdef DEBUG cerr << "** URL: exec " << cmd << endl; #endif int status = system(cmd.c_str()); // create the HTML page htmlPagePtr = new HTMLPage(stdFileName, URLName); // delete the filename now unlink(stdFileName.c_str()); } std::string URL::getNextLink(void) throw(URLException) { using namespace std; if (!htmlPagePtr) { cerr << "URL::getNextLink(): uninitialized htmlPagePtr" << endl; throw URLException(); } string ret; if (linkCounter == htmlPagePtr->getNumberOfLinks()) { linkCounter = 0; return ret; } ret = htmlPagePtr->getLink(linkCounter); linkCounter++; return ret; } std::string URL::getHostName(void) { using namespace std; string::size_type p = URLName.find(WET::protocolTag); string::size_type from, to, length; if (p == URLName.npos) { // not found from = 0; } else { from = WET::protocolTag.size(); } to = URLName.find(WET::charSlash, from); if (to == URLName.npos) { length = URLName.size(); } else { length = to - from; } string ret = URLName.substr(from, length); return ret; }