/// Default constructor: does nothing beyond default-initializing the members.
RobotsTxt::RobotsTxt() = default;
/// Destructor: no explicit cleanup is performed here.
RobotsTxt::~RobotsTxt() = default;
threads/downloadingthread.h
#ifndef DOWNLOADINGTHREAD_H
#define DOWNLOADINGTHREAD_H
#include "includes. h"
#include "rccbasethread. h"
// Base class for downloading threads: an RCCBaseThread that additionally
// carries a QNetworkAccessManager for issuing requests.
class DownloadingThread : public RCCBaseThread
{
Q_OBJECT
public:
// pdM is forwarded unchanged to the RCCBaseThread constructor.
DownloadingThread(DataManager* pdM);
virtual ~DownloadingThread();
// Network access manager shared by derived classes; it is allocated in the
// constructor with this object as its QObject parent.
QNetworkAccessManager *pnam;
};
#endif // DOWNLOADINGTHREAD_H
threads/downloadingthread.cpp
#include "downloadingthread. h"
/// Forwards the data manager to RCCBaseThread and creates the network access
/// manager as a QObject child of this thread object.
DownloadingThread::DownloadingThread(DataManager* pdM)
    : RCCBaseThread(pdM)
    , pnam(new QNetworkAccessManager(this))
{
}
/// Hands the network access manager over to Qt's deferred deletion.
DownloadingThread::~DownloadingThread()
{
    // deleteLater() is used instead of a direct delete.
    QNetworkAccessManager *manager = pnam;
    manager->deleteLater();
}
threads/dt0.h
#ifndef DOWNLOADINGTHREAD0_H
#define DOWNLOADINGTHREAD0_H
#include "downloadingthread. h"
#include "delayedpdpass. h"
#include "data_structures/hostdownloaddata. h"
// Downloading thread implementation: pulls PageData items per host from the
// DataManager, issues HTTP GET requests for them and queues the processed
// items for DataManager::updatePDs().
class DT0 : public DownloadingThread
{
Q_OBJECT
public:
// Reads its settings from RCCSettings and wires up the reply-handling slots.
DT0(DataManager* pdM);
virtual ~DT0();
protected:
// Main loop; keeps scheduling downloads until the `stop` flag is set.
virtual void run();
// Starts a GET request for pD and registers the reply in _replyPDLinks.
// Increments pD.downloadAttempts on the caller's object.
void _download(PageData &pD);
// In-flight replies mapped to the PageData each one was issued for.
QMap<QNetworkReply*, PageData> _replyPDLinks;
// Setting "DownloadingThreadPDChunkSizeMultiplier": passed to
// getFreePDsForDownloading(); twice this value is the _outPDs flush threshold.
int _pdChunkSizeMultiplier;
// Processed PageData waiting to be handed to DataManager::updatePDs().
PDContainer _outPDs;
// Per-host download state (pending items, cursor, active download count).
QMap<HostData*, HostDownloadData> _hDDs;
// Setting "SaveDuplicates": when true, already-seen content is kept and
// marked CONTENT_DUPLICATE; when false the page is flagged for removal.
bool _saveDuplicates;
protected slots:
// Handles a finished reply: retry, redirect, error hand-off or content storage.
void slotFinished(QNetworkReply* preply);
// Records a failed download with the reply's error code and logs it.
void slotError(QNetworkReply* preply);
// Invoked on a reply's download progress; drops replies that answered 200
// with a content type other than text/html or text/plain.
void slotDownloadProgress(qint64 bytesReceived, qint64 bytesTotal);
// Receives a PageData from a DelayedPDPass and starts its download.
void slotPDCameOnDownload(PageData pD);
signals:
// Emitted by slotFinished() for failures that are not retried; connected to slotError().
void errorSignal(QNetworkReply* preply);
// NOTE(review): not emitted anywhere in the code visible here — confirm usage.
void sleepSignal();
};
#endif // DOWNLOADINGTHREAD0_H
threads/dt0.cpp
#include "dt0.h"
/// Loads the thread's tunables from RCCSettings and connects the reply
/// handling: finished replies go to slotFinished(), and errorSignal() is
/// routed to slotError().
DT0::DT0(DataManager* pdM)
    : DownloadingThread(pdM)
{
    RCCSettings *settings = RCCSettings::instance();

    _pdChunkSizeMultiplier =
        settings->value("DownloadingThreadPDChunkSizeMultiplier", "r").toInt();
    _sleepTime = settings->value("DownloadingThreadSleepTime", "r").toInt();
    _saveDuplicates = settings->value("SaveDuplicates", "r").toBool();

    connect(pnam, SIGNAL(finished(QNetworkReply*)),
            this, SLOT(slotFinished(QNetworkReply*)));
    connect(this, SIGNAL(errorSignal(QNetworkReply*)),
            this, SLOT(slotError(QNetworkReply*)));
}
/// Destructor: DT0 adds no cleanup of its own.
DT0::~DT0() = default;
void DT0::run()
{
while(true)
{
if (stop)
break;
QCoreApplication::processEvents();
QVector<HostData*> phosts = _pdM->getHosts();
for (HostData *phD: phosts)
{
QMap<HostData*, HostDownloadData>::iterator hDDsIt;
if (!_hDDs. contains(phD))
{
hDDsIt = _hDDs. insert(phD, HostDownloadData());
_pdM->getFreePDsForDownloading(hDDsIt->pDs, phD,_pdChunkSizeMultiplier);
hDDsIt->pDsIt = hDDsIt->pDs. begin();
}
else
hDDsIt = _hDDs. find(phD);
if (hDDsIt->pDs. end() == hDDsIt->pDsIt)
{
_pdM->getFreePDsForDownloading(hDDsIt->pDs, phD, _pdChunkSizeMultiplier);
hDDsIt->pDsIt = hDDsIt->pDs. begin();
}
while (hDDsIt->curDownloadsCount < phD->maxDownloadsAtTime
&& hDDsIt->pDs. end() != hDDsIt->pDsIt)
{
hDDsIt->curDownloadsCount++;
int downloadDelay = phD->crawlDelay;
if (!downloadDelay)
{
_download(*hDDsIt->pDsIt);
}
else
{
DelayedPDPass *pdPDP = new DelayedPDPass(*hDDsIt->pDsIt, downloadDelay, this);
connect(pdPDP, SIGNAL(pDIsReady(PageData)),
this, SLOT(slotPDCameOnDownload(PageData)));
}
hDDsIt->pDsIt++;
}
}
if (_outPDs. size() >= _pdChunkSizeMultiplier*2)
{
_pdM->updatePDs(_outPDs);
_outPDs. clear();
}
else
{
bool noDataForSomeOfHosts = false;
for (HostData *phD: phosts)
{
QMap<HostData*, HostDownloadData>::iterator hDDsIt = _hDDs. find(phD);
if(hDDsIt->pDs. end() == hDDsIt->pDsIt)
{
noDataForSomeOfHosts = true;
break;
}
}
if (noDataForSomeOfHosts)
{
_pdM->updatePDs(_outPDs);
_outPDs. clear();
}
}
if (_sleepTime)
msleep(_sleepTime);
}
}
/// Issues an HTTP GET for the page and registers the reply.
///
/// @param pD Page to download. Its downloadAttempts counter is incremented in
///           place, and a copy of the page is stored in _replyPDLinks keyed
///           by the new reply.
void DT0::_download(PageData &pD)
{
    ++pD.downloadAttempts;

    // Absolute URL = host prefix string + normalized page URL.
    const QString absoluteUrl = pD.phD->str + pD.normalizedUrl;

    QNetworkRequest req(absoluteUrl);
    req.setRawHeader("User-Agent", RCCConsts::USER_AGENT);

    QNetworkReply *reply = pnam->get(req);
    _replyPDLinks.insert(reply, pD);

    // Progress notifications are used by slotDownloadProgress() to inspect
    // the response headers.
    connect(reply, SIGNAL(downloadProgress(qint64,qint64)),
            this, SLOT(slotDownloadProgress(qint64,qint64)));
}
/// Handles a finished network reply.
///
/// Outcomes, in order of evaluation:
///  - network error that is permanent (not found / access denied / operation
///    not permitted) or attempts exhausted: errorSignal() is emitted and the
///    connected slotError() records the failure and releases the reply;
///  - any other network error: the page is queued in _outPDs unchanged so it
///    can be attempted again;
///  - HTTP 301/302/303: the page URL is rewritten to the redirect target if it
///    stays on the same host and scheme (or is relative) and attempts remain,
///    otherwise the page is flagged for removal;
///  - success: content is stored, or handled as a duplicate according to
///    _saveDuplicates.
///
/// @param preply Reply reported by QNetworkAccessManager::finished().
void DT0::slotFinished(QNetworkReply *preply)
{
    // A reply that is no longer tracked has no PageData to work with;
    // proceeding would operate on a default-constructed PageData.
    QMap<QNetworkReply*, PageData>::iterator linkIt = _replyPDLinks.find(preply);
    if (linkIt == _replyPDLinks.end())
    {
        preply->deleteLater();
        return;
    }
    PageData pD = linkIt.value();

    // Guard the lookup: dereferencing end() is undefined behaviour.
    QMap<HostData*, HostDownloadData>::iterator hDDsIt = _hDDs.find(pD.phD);
    if (hDDsIt != _hDDs.end())
        hDDsIt->curDownloadsCount--;

    const QNetworkReply::NetworkError error = preply->error();
    if (error != QNetworkReply::NoError)
    {
        const bool permanentFailure =
            error == QNetworkReply::ContentNotFoundError
            || error == QNetworkReply::ContentAccessDenied
            || error == QNetworkReply::ContentOperationNotPermittedError;

        if (permanentFailure || pD.downloadAttempts > RCCConsts::MAX_DOWNLOAD_ATTEMPTS)
        {
            // The map entry is intentionally left in place: slotError() takes it.
            emit errorSignal(preply);
            return;
        }

        // Transient failure: hand the page back so it can be retried.
        _replyPDLinks.remove(preply);
        _outPDs.append(pD);
        if (error == QNetworkReply::ServiceUnavailableError)
        {
            _pdM->insertLogItem(DataManager::CommonLog,
                                PDLogItem(
                                    QString("Download will be attempted again (Service anavailable). Was ")
                                    + QString::number(pD.downloadAttempts) + QString(" attempts: "), pD));
        }
        // Fix: this path previously returned without releasing the reply,
        // leaking one QNetworkReply per retried download.
        preply->deleteLater();
        return;
    }

    const int statusCode = preply->attribute(QNetworkRequest::HttpStatusCodeAttribute).toInt();
    if (statusCode == 301 || statusCode == 302 || statusCode == 303)
    {
        _replyPDLinks.remove(preply);
        const QUrl redirectUrl =
            preply->attribute(QNetworkRequest::RedirectionTargetAttribute).toUrl();
        preply->deleteLater();
        _pdM->insertLogItem(DataManager::CommonLog,
                            PDLogItem("Redirected to " + redirectUrl.toString() + ": ", pD));

        const bool sameSite =
            (pD.phD->host == redirectUrl.host() && pD.phD->protocol == redirectUrl.scheme())
            || redirectUrl.isRelative();

        if (sameSite && pD.downloadAttempts <= RCCConsts::MAX_DOWNLOAD_ATTEMPTS)
        {
            // Follow the redirect by rewriting the page to path[?query].
            pD.downloadAttempts++;
            const QString query = redirectUrl.query();
            QString redirectUrlStr = redirectUrl.path();
            if (query.length())
                redirectUrlStr += "?" + query;
            pD.url = redirectUrlStr;
            pD.normalizedUrl = PageData::normalizeUrl(redirectUrlStr);
        }
        else
        {
            // Off-site redirect or attempts exhausted: drop the page.
            pD.remove = true;
        }
        _outPDs.append(pD);
        return;
    }

    _replyPDLinks.remove(preply);
    _pdM->insertLogItem(DataManager::CommonLog, PDLogItem("Downloaded: ", pD));

    const QString content = QString(preply->readAll());
    const std::pair<bool, PageData> contentSeen = _pdM->contentSeen(pD.phD, content);
    if (!contentSeen.first)
    {
        pD.content = content;
        pD.contentHash = PageData::hashContent(pD.content);
        pD.downloaded = true;
    }
    else
    {
        // Same content already stored under another page id.
        pD.replaceId = contentSeen.second.id;
        if (_saveDuplicates)
        {
            pD.errorCode = RCCConsts::CONTENT_DUPLICATE;
            pD.downloaded = true;
        }
        else
        {
            pD.remove = true;
        }
    }
    _outPDs.append(pD);
    preply->deleteLater();
}
/// Records a download that failed for good.
///
/// Takes the page associated with the reply out of _replyPDLinks, stamps it
/// with the reply's error code, marks it as downloaded, queues it in _outPDs,
/// writes the reply's error string to the error log and releases the reply.
void DT0::slotError(QNetworkReply *preply)
{
    PageData failed = _replyPDLinks.take(preply);
    failed.downloaded = true;
    failed.errorCode = preply->error();
    _outPDs.append(failed);

    const PDLogItem logEntry(preply->errorString(), failed);
    _pdM->insertLogItem(DataManager::ErrorLog, logEntry);

    preply->deleteLater();
}
/// Inspects a reply on its first progress notification and drops it when the
/// server answered 200 with a content type other than text/html or text/plain.
///
/// The slot disconnects itself from the reply immediately, so each reply is
/// examined at most once. A dropped reply is removed from the bookkeeping,
/// fully disconnected, scheduled for deletion, and its page is queued in
/// _outPDs flagged for removal.
///
/// @param bytesReceived Unused.
/// @param bytesTotal    Unused.
void DT0::slotDownloadProgress(qint64 bytesReceived, qint64 bytesTotal)
{
    Q_UNUSED(bytesReceived);
    Q_UNUSED(bytesTotal);

    // Checked cast: sender() is null when the slot is called directly, and a
    // C-style cast would silently accept any QObject.
    QNetworkReply *preply = qobject_cast<QNetworkReply*>(sender());
    if (!preply)
        return;

    disconnect(preply, SIGNAL(downloadProgress(qint64,qint64)),
               this, SLOT(slotDownloadProgress(qint64,qint64)));

    const int statusCode = preply->attribute(QNetworkRequest::HttpStatusCodeAttribute).toInt();
    if (statusCode != 200)
        return;

    const QString contentType(preply->rawHeader("Content-type"));
    const bool isText =
        contentType.indexOf("text/html", Qt::CaseInsensitive) != -1
        || contentType.indexOf("text/plain", Qt::CaseInsensitive) != -1;
    if (isText)
        return;

    // Without a tracked page there is nothing to account for.
    QMap<QNetworkReply*, PageData>::iterator linkIt = _replyPDLinks.find(preply);
    if (linkIt == _replyPDLinks.end())
        return;
    PageData pD = linkIt.value();

    // Guard the lookup: dereferencing end() is undefined behaviour.
    QMap<HostData*, HostDownloadData>::iterator hDDsIt = _hDDs.find(pD.phD);
    if (hDDsIt != _hDDs.end())
        hDDsIt->curDownloadsCount--;

    _replyPDLinks.erase(linkIt);
    // Disconnect every signal of the reply before it is deleted.
    preply->disconnect();
    preply->deleteLater();

    pD.remove = true;
    _outPDs.append(pD);
}
/// Starts the download of a page delivered through the pDIsReady(PageData)
/// connection set up in run().
void DT0::slotPDCameOnDownload(PageData pD)
{
    // pD is this slot's own by-value copy; _download() increments its
    // attempt counter and stores a copy alongside the reply.
    PageData &ready = pD;
    _download(ready);
}
threads/parsingthread.h
#ifndef PARSINGTHREAD_H
#define PARSINGTHREAD_H
#include "includes. h"
#include "rccbasethread. h"
class ParsingThread : public RCCBaseThread
{
Q_OBJECT
public:
|
Из-за большого объема этот материал размещен на нескольких страницах:
1 2 3 4 5 6 7 8 9 |


