Solution

/***************************************************************
** Name:        htmlpage.h
** Author:      Leo Liberti
** Source:      GNU C++
** Purpose:     www exploring topologizer - HTML page handler (header)
** History:     060820 work started
****************************************************************/

#ifndef _WETHTMLPAGEH
#define _WETHTMLPAGEH

#include "fileparser.h"
#include<string>
#include<vector>

// Exception type thrown by HTMLPage: by its constructor when the page URL
// contains no ':' (i.e. is not absolute), and by getLink() when the
// requested index is out of bounds. The class has no data members; the
// diagnostic text is written to the standard error stream at the throw site.
class HTMLPageException {
 public:
  HTMLPageException();
  ~HTMLPageException();
};

// Handler for one HTML page stored in a file. The page is parsed through
// the inherited FileParser; of the strings the parser returns, those that
// look like HTTP links are converted to absolute URLs and kept.
class HTMLPage : public FileParser {
 public:
  // theFileName is handed to the FileParser base class (together with the
  // string "HREF"); theURL is the address of the page and is used as the
  // base for relative links.
  // Throws HTMLPageException if theURL contains no ':' character.
  HTMLPage(std::string theFileName, std::string theURL) 
    throw(HTMLPageException);
  ~HTMLPage();  
  
  // number of links stored for this page
  int getNumberOfLinks(void) const;
  // return the i-th stored link (0-based);
  // throws HTMLPageException if i is negative or not less than the
  // number of links
  std::string getLink(int i) const throw(HTMLPageException);

 private:
  // check whether a given link is an HTTP link: links starting with '#'
  // are rejected, links without a ':' are accepted as relative, any other
  // link is accepted only if the text before its first ':' is "http"
  // (compared case-insensitively)
  bool isHTTP(const std::string& theLink) const;

  // construct an absolute URL from a relative one; a link whose first ':'
  // is immediately followed by "//" is returned unchanged
  std::string makeAbsoluteURL(const std::string& theLink) const;

  // name of the HTML file, as passed to the constructor
  std::string htmlFileName;
  // URL of the page, as passed to the constructor
  std::string URL;
  // base for relative links: URL plus a trailing '/' when URL has fewer
  // than three '/' characters, otherwise URL up to and including its
  // last '/'
  std::string URLPrefix;
  // absolute URLs of the HTTP links found in the page
  std::vector<std::string> link;
};

#endif

/***************************************************************
** Name:        htmlpage.cxx
** Author:      Leo Liberti
** Source:      GNU C++
** Purpose:     www exploring topologizer - HTML page handler 
** History:     060820 work started
****************************************************************/

#include "htmlpage.h"
#include "wet.h"
#include<iostream>
#include<algorithm>

// Character and protocol-name constants used while analysing URLs.
// (No ';' after the closing brace: a namespace definition is not followed
// by a semicolon, and the stray one is an empty declaration that C++03
// compilers reject in pedantic mode.)
namespace WET {
  // separates the protocol name from the rest of a URL
  const char charColon = ':';
  // separates the components of a URL path
  const char charSlash = '/';
  // plain dot character
  const char charDot = '.';
  // first character of a "#bookmark" link
  const char charHash = '#';
  // protocol name of the links that are kept
  const std::string httpProtocolName = "http";
}

// The exception object holds no state, so construction does nothing.
HTMLPageException::HTMLPageException()
{
}

// Nothing to release on destruction either.
HTMLPageException::~HTMLPageException()
{
}

// Build the handler for the page stored in theFileName and located at
// theURL: validate the URL, derive the prefix used to resolve relative
// links, parse the file and store every HTTP link in absolute form.
// Throws HTMLPageException (after writing a message to cerr) when theURL
// contains no ':' character.
HTMLPage::HTMLPage(std::string theFileName, std::string theURL) 
  throw(HTMLPageException) : 
  // the base class is always initialised before the data members, whatever
  // the order written here; listing it first makes the text match reality
  FileParser(theFileName, "HREF"), htmlFileName(theFileName), URL(theURL) {

  using namespace std;
  using namespace WET;

  // check the URL is well formed
  if (URL.find(charColon) == URL.npos) {
    // no colon, URL is not absolute
    cerr << "HTMLPage::HTMLPage(): " << URL << " is not absolute" << endl;
    throw HTMLPageException();
  }
  if (count(URL.begin(), URL.end(), charSlash) < 3) {
    // URL is "http://site.name": the whole URL is the prefix
    URLPrefix = URL + charSlash;
  } else {
    // find the URL prefix (ending with '/')
    URLPrefix = URL.substr(0, URL.rfind(charSlash) + 1);
  }

  // call the inherited fileParser::parse() method to parse the file
  parse();
  int n = getNumberOfParsedStrings();
  for(int i = 0; i < n; i++) {
    string theLink = getParsedString(i);
    if (isHTTP(theLink)) {
      // save the absolute version of the link only if it's an HTTP link
      link.push_back(makeAbsoluteURL(theLink));
    }
  }
}

// Destructor: has no work of its own; when DEBUG is defined it writes the
// address of the object being destroyed to the standard error stream.
HTMLPage::~HTMLPage() {
#if defined(DEBUG)
  std::cerr << "** destroying HTMLPage ";
  std::cerr << this;
  std::cerr << std::endl;
#endif
}

// Report how many links are stored for this page.
int HTMLPage::getNumberOfLinks(void) const {
  const int howMany = static_cast<int>(link.size());
  return howMany;
}

// Return the i-th stored link (0-based).
// Throws HTMLPageException, after writing a message to cerr, when i is
// negative or not smaller than the number of stored links.
std::string HTMLPage::getLink(int i) const throw(HTMLPageException) {
  using namespace std;
  // the sign is tested first, so the conversion to the unsigned size type
  // below is value-preserving and the comparison no longer mixes signed
  // and unsigned operands
  if (i < 0 || static_cast<vector<string>::size_type>(i) >= link.size()) {
    cerr << "HTMLPage::getLink(" << i << "): counter out of bounds" << endl;
    throw HTMLPageException();
  }
  return link[i];
}

// Decide whether theLink should be treated as an HTTP link.
// Returns false for "#bookmark" links, true for links without any ':'
// (taken to be relative), and otherwise true only when the text before the
// first ':' equals "http" ignoring case.
bool HTMLPage::isHTTP(const std::string& theLink) const {
  using namespace std;
  using namespace WET;

  if (!theLink.empty() && theLink[0] == charHash) {
    // url is "#bookmark", not a valid url as it refers to same page
    return false;
  }
  // keep the position in the string's own size type: storing it in an int
  // would narrow npos and make the comparison below rely on a
  // signed/unsigned round-trip
  const string::size_type colonpos = theLink.find(charColon);
  if (colonpos == string::npos) {
    // no ':' char in link, relative URL, ok
    return true;
  }
  // the protocol name is whatever precedes the first ':'
  if (compareCaseInsensitive(theLink.substr(0, colonpos), 
                             httpProtocolName) == 0) {
    return true;
  }
  return false;
}

// Turn theLink into an absolute URL.
//  - a link whose first ':' is immediately followed by "//" is already
//    absolute and is returned unchanged;
//  - a link starting with '/' is appended to the part of URLPrefix that
//    precedes the first '/' found after the "//" of the prefix;
//  - any other link is appended to URLPrefix as it is.
std::string HTMLPage::makeAbsoluteURL(const std::string& theLink) const {
  using namespace std;
  using namespace WET;
  string ret;
  const string::size_type colon = theLink.find(charColon);
  // explicit bounds check so that no character past the end is ever read
  if (colon != string::npos && colon + 2 < theLink.size() &&
      theLink[colon + 1] == charSlash && theLink[colon + 2] == charSlash) {
    // link is absolute, return
    return theLink;
  }
  if (!theLink.empty() && theLink[0] == charSlash) {
    // first character is a slash, the prefix has to be shortened
    // find the third occurrence of '/' in the URLPrefix (http://.../...):
    // locate the first '/', then search again starting two characters
    // further on, which skips the second '/' of "//"
    string::size_type thePos = URLPrefix.find(charSlash);
    thePos = URLPrefix.find(charSlash, thePos + 2);
    // if no further '/' exists thePos is npos and substr() keeps the
    // whole prefix
    ret = URLPrefix.substr(0, thePos) + theLink;
  } else {
    ret = URLPrefix + theLink;
  }
  return ret;
}

Leo Liberti 2008-01-12