/*************************************************************** ** Name: htmlpage.h ** Author: Leo Liberti ** Source: GNU C++ ** Purpose: www exploring topologizer - HTML page handler (header) ** History: 060820 work started ****************************************************************/ #ifndef _WETHTMLPAGEH #define _WETHTMLPAGEH #include "fileparser.h" #include<string> #include<vector> class HTMLPageException { public: HTMLPageException(); ~HTMLPageException(); }; class HTMLPage : public FileParser { public: HTMLPage(std::string theFileName, std::string theURL) throw(HTMLPageException); ~HTMLPage(); int getNumberOfLinks(void) const; std::string getLink(int i) const throw(HTMLPageException); private: // check whether a given link is an HTTP link bool isHTTP(const std::string& theLink) const; // construct an absolute URL from a relative one std::string makeAbsoluteURL(const std::string& theLink) const; std::string htmlFileName; std::string URL; std::string URLPrefix; std::vector<std::string> link; }; #endif
/***************************************************************
** Name:      htmlpage.cxx
** Author:    Leo Liberti
** Source:    GNU C++
** Purpose:   www exploring topologizer - HTML page handler
** History:   060820 work started
****************************************************************/

#include "htmlpage.h"
#include "wet.h"
#include<iostream>
#include<algorithm>

// Character and string constants used while analysing URLs.
namespace WET {
  const char charColon = ':';
  const char charSlash = '/';
  const char charDot = '.';
  const char charHash = '#';
  const std::string httpProtocolName = "http";
}

namespace {

  // Returns the index of the ':' that terminates the scheme of theLink, or
  // std::string::npos when theLink has no scheme (it is a relative
  // reference).
  //
  // A ':' only delimits a scheme when it appears before any '/', '?' or '#':
  // a colon inside a path, a query or a fragment (e.g. "page.html#sec:1" or
  // "dir/a:b.html") belongs to a relative reference.
  std::string::size_type schemeColonPosition(const std::string& theLink) {
    const std::string::size_type pos = theLink.find_first_of(":/?#");
    if (pos != std::string::npos && theLink[pos] == WET::charColon) {
      return pos;
    }
    return std::string::npos;
  }

}

HTMLPageException::HTMLPageException() { }

HTMLPageException::~HTMLPageException() { }

// Parses theFileName for links and stores the absolute form of every link
// accepted by isHTTP().
//
// theFileName: local file containing the HTML text
// theURL:      URL of the page; must contain a ':' (i.e. be absolute)
// Throws HTMLPageException (after printing a diagnostic on std::cerr) when
// theURL contains no ':'.
//
// The base class is listed first in the initializer list because that is the
// order in which initialization actually happens.
HTMLPage::HTMLPage(std::string theFileName, std::string theURL) throw(HTMLPageException)
  : FileParser(theFileName, "HREF"),
    htmlFileName(theFileName),
    URL(theURL) {
  using namespace std;
  using namespace WET;

  // check the URL is well formed
  if (URL.find(charColon) == string::npos) {
    // no colon, URL is not absolute
    cerr << "HTMLPage::HTMLPage(): " << URL << " is not absolute" << endl;
    throw HTMLPageException();
  }

  if (count(URL.begin(), URL.end(), charSlash) < 3) {
    // URL is "http://site.name"
    URLPrefix = URL + charSlash;
  } else {
    // find the URL prefix (ending with '/');
    URLPrefix = URL.substr(0, URL.rfind(charSlash) + 1);
  }

  // call the inherited fileParser::parse() method to parse the file
  parse();

  const int n = getNumberOfParsedStrings();
  string theLink;
  for (int i = 0; i < n; i++) {
    theLink = getParsedString(i);
    if (isHTTP(theLink)) {
      // save the absolute version of the link only if it's an HTTP link
      link.push_back(makeAbsoluteURL(theLink));
    }
  }
}

HTMLPage::~HTMLPage() {
#ifdef DEBUG
  std::cerr << "** destroying HTMLPage " << this << std::endl;
#endif
}

// Returns the number of links stored for this page.
int HTMLPage::getNumberOfLinks(void) const {
  return static_cast<int>(link.size());
}

// Returns the i-th stored link (0-based).
// Throws HTMLPageException (after printing a diagnostic on std::cerr) when i
// is negative or not less than the number of stored links.
std::string HTMLPage::getLink(int i) const throw(HTMLPageException) {
  using namespace std;
  // i is known to be non-negative when the cast is evaluated, so the
  // comparison with the unsigned size is well defined
  if (i < 0 || static_cast<vector<string>::size_type>(i) >= link.size()) {
    cerr << "HTMLPage::getLink(" << i << "): counter out of bounds" << endl;
    throw HTMLPageException();
  }
  return link[i];
}

// Returns true when theLink should be kept: either it has no scheme (it is
// relative to the current page) or its scheme is "http", compared
// case-insensitively. Same-page references ("#bookmark") are rejected.
bool HTMLPage::isHTTP(const std::string& theLink) const {
  using namespace std;
  using namespace WET;

  if (!theLink.empty() && theLink[0] == charHash) {
    // url is "#bookmark", not a valid url as it refers to same page
    return false;
  }

  const string::size_type colonPos = schemeColonPosition(theLink);
  if (colonPos == string::npos) {
    // no scheme in link, relative URL, ok
    return true;
  }

  return compareCaseInsensitive(theLink.substr(0, colonPos),
                                httpProtocolName) == 0;
}

// Returns the absolute form of theLink, resolved against the URL of this
// page:
//   "scheme://..."  -> returned unchanged
//   "//host/path"   -> scheme of the page URL + theLink
//   "/path"         -> "scheme://host" of the page + theLink
//   anything else   -> URLPrefix + theLink
// No path normalisation (e.g. of "../") is performed.
std::string HTMLPage::makeAbsoluteURL(const std::string& theLink) const {
  using namespace std;
  using namespace WET;

  const string::size_type colon = schemeColonPosition(theLink);
  if (colon != string::npos && theLink.compare(colon + 1, 2, "//") == 0) {
    // link is absolute, return
    return theLink;
  }

  if (theLink.compare(0, 2, "//") == 0) {
    // network-path reference: only the scheme of the current page is reused
    // (URL is guaranteed by the constructor to contain a ':')
    return URL.substr(0, URL.find(charColon) + 1) + theLink;
  }

  if (!theLink.empty() && theLink[0] == charSlash) {
    // first character is a slash, the prefix has to be shortened
    // find the third occurrence of '/' in the URLPrefix (http://.../...)
    string::size_type thePos = URLPrefix.find(charSlash);
    if (thePos != string::npos) {
      thePos = URLPrefix.find(charSlash, thePos + 2);
    }
    // when there is no third slash thePos is npos and the whole prefix is used
    return URLPrefix.substr(0, thePos) + theLink;
  }

  return URLPrefix + theLink;
}