/***************************************************************
** Name: url.h
** Author: Leo Liberti
** Source: GNU C++
** Purpose: www exploring topologizer - URL data type (header)
** History: 060820 work started
****************************************************************/
#ifndef _WETURLH
#define _WETURLH
#include<string>
#include "htmlpage.h"
// Exception type thrown by the URL class (see URL::download and
// URL::getNextLink). It carries no data: callers can only tell that
// the operation failed.
class URLException {
public:
  URLException();
  ~URLException();
};
// A web address together with the HTML page downloaded from it.
class URL {
public:
  // Build a URL with an empty name and no downloaded page.
  URL();
  // Build a URL named theURL; nothing is downloaded yet.
  URL(std::string theURL);
  // Release the downloaded page, if there is one.
  ~URL();

  // Read / replace the URL string.
  std::string getURLName() const;
  void setURLName(std::string theURL);

  // Return the fileName member.
  std::string getFileName() const;

  // Download the URL and build the HTML page from the result.
  // Throws URLException when the URL name is empty.
  void download() throw(URLException);

  // Return the next HTTP link of the downloaded page. Once every link
  // has been returned an empty string is returned and the iteration
  // starts over. Throws URLException when no page has been downloaded.
  std::string getNextLink() throw(URLException);

  // Return the hostname in the URL.
  std::string getHostName();

private:
  // Page built by download(); deleted by the destructor.
  // NOTE(review): no copy constructor or assignment operator is
  // declared, so a copied URL shares this pointer with its source and
  // both destructors will delete it - confirm URL objects are never
  // copied after download().
  HTMLPage* htmlPagePtr;
  // The URL string.
  std::string URLName;
  // Value reported by getFileName().
  std::string fileName;
  // Index of the link the next getNextLink() call returns.
  int linkCounter;
};
#endif
/***************************************************************
** Name: url.cxx
** Author: Leo Liberti
** Source: GNU C++
** Purpose: www exploring topologizer - URL data type
** History: 060820 work started
****************************************************************/
#include<iostream>
#include<string>
#include<cstdlib>
#include<algorithm>
#include<unistd.h>
#include "url.h"
#include "wet.h"
// Constants used by the URL implementation.
namespace WET {
  // Separator that ends the host part of a URL.
  const char charSlash = '/';
  // Double quote placed around the URL on the command line.
  const char charQuote = '"';
  // Scheme prefix recognised by URL::getHostName().
  const std::string protocolTag("http://");
  // Download command; the output file name is appended right after it.
  const std::string theCommand("wget --timeout=10 --quiet -O ");
  // Name of the temporary file the page is downloaded to.
  const std::string stdFileName(".wet.html");
}
// Nothing to initialise: the exception has no data members.
URLException::URLException() {
}
// Nothing to release.
URLException::~URLException() {
}
// Default constructor: no page yet, link iteration starts at 0.
// URLName and fileName are default-constructed (empty).
URL::URL()
  : htmlPagePtr(NULL),
    linkCounter(0) {
}
// Construct a URL named theURL. No page is downloaded here; the page
// pointer stays null and link iteration starts at 0.
// The initialisers are listed in member declaration order, which is
// the order they are executed in regardless of how they are written.
URL::URL(std::string theURL)
  : htmlPagePtr(NULL),
    URLName(theURL),
    linkCounter(0) {
}
// Destructor: releases the HTMLPage owned by this URL, if any.
// When DEBUG is defined a trace line is written to std::cerr first.
URL::~URL() {
#ifdef DEBUG
  // Trace before deleting, so the pointer value is never read after
  // the object it points to has been freed.
  if (htmlPagePtr) {
    std::cerr << "** destroying URL " << this
              << " and htmlPagePtr " << htmlPagePtr << std::endl;
  } else {
    std::cerr << "** destroying URL " << this << std::endl;
  }
#endif
  // delete on a null pointer is a no-op, so no guard is required.
  delete htmlPagePtr;
  htmlPagePtr = NULL;
}
// Return a copy of the URL string held by this object.
std::string URL::getURLName() const {
  std::string name(URLName);
  return name;
}
// Replace the URL string with theURL. Only the string is changed; an
// already downloaded page and the link counter are left untouched.
void URL::setURLName(std::string theURL) {
  URLName.assign(theURL);
}
// Return a copy of the fileName member.
std::string URL::getFileName() const {
  std::string name(fileName);
  return name;
}
void URL::download(void) throw(URLException) {
using namespace std;
using namespace WET;
if (URLName == "") {
cerr << "URL::download(): URLName not initialized" << endl;
throw URLException();
}
unlink(stdFileName.c_str());
string cmd = theCommand + stdFileName + " " +
charQuote + URLName + charQuote;
// spawn the command
#ifdef DEBUG
cerr << "** URL: exec " << cmd << endl;
#endif
int status = system(cmd.c_str());
// create the HTML page
htmlPagePtr = new HTMLPage(stdFileName, URLName);
// delete the filename now
unlink(stdFileName.c_str());
}
// Return the link at the current position of the downloaded page and
// advance to the next one. When the position has reached the number of
// links, the position is reset to 0 and an empty string is returned.
// Throws URLException when there is no downloaded page.
std::string URL::getNextLink(void) throw(URLException) {
  if (htmlPagePtr == NULL) {
    std::cerr << "URL::getNextLink(): uninitialized htmlPagePtr" << std::endl;
    throw URLException();
  }
  if (linkCounter != htmlPagePtr->getNumberOfLinks()) {
    std::string link;
    link = htmlPagePtr->getLink(linkCounter);
    // advance only after the link has been fetched
    ++linkCounter;
    return link;
  }
  // end of the list: rewind so the next call starts over
  linkCounter = 0;
  return std::string();
}
// Return the host part of the URL name.
// A leading WET::protocolTag ("http://") is skipped when the name
// starts with it; the host then runs up to the first '/', '?' or '#',
// or to the end of the name when none of them is present. Anything
// between the tag and that delimiter (e.g. a ":port" suffix) is kept.
std::string URL::getHostName(void) {
  const std::string& tag = WET::protocolTag;
  std::string::size_type from = 0;
  // The tag only counts as a scheme when it is a prefix of the name; a
  // "http://" that appears later (e.g. inside a query string) must not
  // move the start of the host.
  if (URLName.compare(0, tag.size(), tag) == 0) {
    from = tag.size();
  }
  // Path, query and fragment each end the host part.
  const char delimiters[] = { WET::charSlash, '?', '#', '\0' };
  const std::string::size_type to = URLName.find_first_of(delimiters, from);
  if (to == std::string::npos) {
    // no delimiter: everything after the tag is the host
    return URLName.substr(from);
  }
  return URLName.substr(from, to - from);
}