/***************************************************************
** Name: htmlpage.h
** Author: Leo Liberti
** Source: GNU C++
** Purpose: www exploring topologizer - HTML page handler (header)
** History: 060820 work started
****************************************************************/
#ifndef _WETHTMLPAGEH
#define _WETHTMLPAGEH
#include "fileparser.h"
#include<string>
#include<vector>
// Exception type thrown by HTMLPage: by its constructor when the page URL
// contains no ':' (i.e. is not absolute), and by getLink() when the requested
// index is out of range. The class has no data members, so it carries no
// information beyond its type; the throwing code prints a diagnostic to
// std::cerr before throwing.
class HTMLPageException {
public:
// default constructor (defined out of line with an empty body)
HTMLPageException();
// destructor (defined out of line with an empty body)
~HTMLPageException();
};
// An HTML page stored in a local file, together with the URL it was fetched
// from. On construction the file is parsed through the inherited FileParser
// machinery and every parsed string accepted by isHTTP() is stored as an
// absolute URL, retrievable through getNumberOfLinks() / getLink().
class HTMLPage : public FileParser {
public:
// theFileName: name of the local file holding the HTML text (forwarded to
//              the FileParser base together with the tag "HREF")
// theURL:      URL of the page; must contain a ':' or HTMLPageException
//              is thrown
HTMLPage(std::string theFileName, std::string theURL)
throw(HTMLPageException);
~HTMLPage();
// number of links stored for this page
int getNumberOfLinks(void) const;
// return the i-th stored link (0-based); throws HTMLPageException when
// i is negative or not smaller than getNumberOfLinks()
std::string getLink(int i) const throw(HTMLPageException);
private:
// check whether a given link is an HTTP link
// (links starting with '#' are rejected, links without ':' are accepted as
// relative URLs, otherwise the text before the first ':' must match "http")
bool isHTTP(const std::string& theLink) const;
// construct an absolute URL from a relative one
// (links already containing "://" are returned unchanged)
std::string makeAbsoluteURL(const std::string& theLink) const;
// copy of the file name passed to the constructor
std::string htmlFileName;
// copy of the page URL passed to the constructor
std::string URL;
// URL up to and including its last '/', or URL + '/' when the URL holds
// fewer than three '/' characters; prepended to relative links
std::string URLPrefix;
// absolute URLs of the links found in the page
std::vector<std::string> link;
};
#endif
/***************************************************************
** Name: htmlpage.cxx
** Author: Leo Liberti
** Source: GNU C++
** Purpose: www exploring topologizer - HTML page handler
** History: 060820 work started
****************************************************************/
#include "htmlpage.h"
#include "wet.h"
#include<iostream>
#include<algorithm>
// Character and string constants used while taking URLs apart.
namespace WET {
  // separates the protocol name from the rest of a URL
  const char charColon(':');
  // path separator inside a URL
  const char charSlash('/');
  // dot character
  const char charDot('.');
  // introduces a same-page bookmark ("#anchor")
  const char charHash('#');
  // protocol name a link must carry to be followed
  const std::string httpProtocolName("http");
}
// Default constructor: the exception class has no members, so there is
// nothing to initialise.
HTMLPageException::HTMLPageException() { }
// Destructor: the exception class has no members, so there is nothing to
// release.
HTMLPageException::~HTMLPageException() { }
// Build the page object for the HTML file theFileName that was fetched
// from theURL.
//
// Steps:
//   1. reject URLs that contain no ':' (diagnostic on std::cerr, then
//      HTMLPageException is thrown);
//   2. derive URLPrefix, the string later prepended to relative links;
//   3. run the inherited parse() and keep, in absolute form, every parsed
//      string that isHTTP() accepts.
//
// The initialiser list is written in the order in which initialisation
// actually happens: base class first, then members in declaration order.
HTMLPage::HTMLPage(std::string theFileName, std::string theURL)
throw(HTMLPageException) :
  FileParser(theFileName, "HREF"),
  htmlFileName(theFileName),
  URL(theURL) {
  // an absolute URL must at least contain the protocol separator
  const bool isAbsolute = URL.find(WET::charColon) != std::string::npos;
  if (!isAbsolute) {
    std::cerr << "HTMLPage::HTMLPage(): " << URL << " is not absolute"
              << std::endl;
    throw HTMLPageException();
  }

  // "http://site.name" holds only two slashes; anything with three or more
  // has a path part whose last component must be cut off
  const bool hasPath =
    std::count(URL.begin(), URL.end(), WET::charSlash) >= 3;
  if (hasPath) {
    // keep everything up to and including the last '/'
    URLPrefix = URL.substr(0, URL.rfind(WET::charSlash) + 1);
  } else {
    // bare site name: the prefix is the URL itself plus a trailing '/'
    URLPrefix = URL + WET::charSlash;
  }

  // let the FileParser base class extract the strings from the file
  parse();

  const int parsedCount = getNumberOfParsedStrings();
  std::string candidate;
  for (int idx = 0; idx < parsedCount; idx++) {
    candidate = getParsedString(idx);
    if (!isHTTP(candidate)) {
      // not an HTTP link: ignore it
      continue;
    }
    // store the absolute version of the link
    candidate = makeAbsoluteURL(candidate);
    link.push_back(candidate);
  }
}
// Destructor. When compiled with DEBUG defined it reports the address of
// the object being destroyed on std::cerr; otherwise it does nothing.
HTMLPage::~HTMLPage() {
#ifdef DEBUG
  using std::cerr;
  using std::endl;
  cerr << "** destroying HTMLPage " << this << endl;
#endif
}
// Return how many links are stored for this page. The container size is
// converted to int, as the declared return type requires.
int HTMLPage::getNumberOfLinks() const {
  return static_cast<int>(link.size());
}
// Return the i-th stored link (0-based).
//
// If i is negative or not smaller than the number of stored links, a
// diagnostic is written to std::cerr and HTMLPageException is thrown.
std::string HTMLPage::getLink(int i) const throw(HTMLPageException) {
  typedef std::vector<std::string>::size_type Index;

  // the sign is tested first, so the conversion to the unsigned index type
  // is only evaluated for non-negative values
  const bool inRange = (i >= 0) && (static_cast<Index>(i) < link.size());
  if (inRange) {
    return link[i];
  }

  std::cerr << "HTMLPage::getLink(" << i << "): counter out of bounds"
            << std::endl;
  throw HTMLPageException();
}
// Decide whether a link found in the page should be kept as an HTTP link.
//
//   theLink: the link text as extracted from the page
//
// Returns
//   false  if the link starts with '#' (a bookmark inside the same page);
//   true   if the link contains no ':' (it is then taken to be a relative
//          URL, which inherits the protocol of the page);
//   true   if the text before the first ':' compares equal, ignoring case,
//          to WET::httpProtocolName;
//   false  otherwise (some other protocol).
//
// The position returned by find() is kept in std::string::size_type: storing
// it in an int and comparing that int with npos only works where the
// narrowing conversion happens to yield -1 and the widening back happens to
// yield npos again.
bool HTMLPage::isHTTP(const std::string& theLink) const {
  using namespace std;
  using namespace WET;

  if (!theLink.empty() && theLink[0] == charHash) {
    // url is "#bookmark", not a valid url as it refers to same page
    return false;
  }

  const string::size_type colonPos = theLink.find(charColon);
  if (colonPos == string::npos) {
    // no ':' char in link, relative URL, ok
    return true;
  }

  // the protocol name is whatever precedes the first ':'
  // (compareCaseInsensitive is not defined in this file; it is used here
  // as returning 0 when the two strings match)
  return compareCaseInsensitive(theLink.substr(0, colonPos),
                                httpProtocolName) == 0;
}
// Turn a link found in the page into an absolute URL.
//
//   theLink: the link text as extracted from the page
//
// Returns
//   - theLink unchanged when its first ':' is immediately followed by "//"
//     (it is already absolute);
//   - the "protocol://host" part of URLPrefix followed by theLink when
//     theLink starts with '/' (path relative to the site root);
//   - URLPrefix followed by theLink otherwise (path relative to the page).
//
// All string positions are kept in std::string::size_type. A search that
// finds nothing is handled explicitly through npos instead of relying on an
// int holding -1 being converted back to npos.
std::string HTMLPage::makeAbsoluteURL(const std::string& theLink) const {
  using namespace std;
  using namespace WET;

  const string::size_type colon = theLink.find(charColon);
  // theLink is const, so indexing at theLink.size() yields '\0' rather than
  // being out of bounds; the && chain stops at the first mismatch, so
  // nothing beyond that position is ever read
  if (colon != string::npos &&
      theLink[colon + 1] == charSlash && theLink[colon + 2] == charSlash) {
    // link is absolute, return
    return theLink;
  }

  if (theLink[0] != charSlash) {
    // relative to the directory of the current page
    return URLPrefix + theLink;
  }

  // first character is a slash, the prefix has to be shortened to
  // "protocol://host": locate the first '/' of "//", skip the second one
  // and look for the '/' that ends the host name
  const string::size_type firstSlash = URLPrefix.find(charSlash);
  const string::size_type hostEnd =
    (firstSlash == string::npos)
      ? string::npos
      : URLPrefix.find(charSlash, firstSlash + 2);

  // substr(0, npos) yields the whole prefix when no such '/' exists
  return URLPrefix.substr(0, hostEnd) + theLink;
}