/// Constructor with an empty body.
RobotsTxt::RobotsTxt() {}

/// Destructor with an empty body.
RobotsTxt::~RobotsTxt() {}

threads/downloadingthread.h

#ifndef DOWNLOADINGTHREAD_H

#define DOWNLOADINGTHREAD_H

#include "includes. h"

#include "rccbasethread. h"

class DownloadingThread : public RCCBaseThread

{

  Q_OBJECT

public:

  DownloadingThread(DataManager* pdM);

  virtual ~DownloadingThread();

  QNetworkAccessManager *pnam;

};

#endif // DOWNLOADINGTHREAD_H

threads/downloadingthread.cpp

#include "downloadingthread. h"

/**
 * Forwards the data manager to RCCBaseThread and allocates the network
 * access manager as a QObject child of this object.
 *
 * @param pdM data manager pointer handed to the base class.
 */
DownloadingThread::DownloadingThread(DataManager* pdM)
  : RCCBaseThread(pdM),
    pnam(new QNetworkAccessManager(this))
{
}

/// Requests deferred deletion of the network access manager.
DownloadingThread::~DownloadingThread()
{
  QNetworkAccessManager *manager = pnam;
  manager->deleteLater();
}

threads/dt0.h

#ifndef DOWNLOADINGTHREAD0_H

#define DOWNLOADINGTHREAD0_H

#include "downloadingthread. h"

#include "delayedpdpass. h"

#include "data_structures/hostdownloaddata. h"

class DT0 : public DownloadingThread

{

  Q_OBJECT

public:

  DT0(DataManager* pdM);

  virtual ~DT0();

protected:

  virtual void run();

  void _download(PageData &pD);

  QMap<QNetworkReply*, PageData> _replyPDLinks;

  int _pdChunkSizeMultiplier;

  PDContainer _outPDs;

  QMap<HostData*, HostDownloadData> _hDDs;

  bool _saveDuplicates;

protected slots:

  void slotFinished(QNetworkReply* preply);

  void slotError(QNetworkReply* preply);

  void slotDownloadProgress(qint64 bytesReceived, qint64 bytesTotal);

НЕ нашли? Не то? Что вы ищете?

  void slotPDCameOnDownload(PageData pD);

signals:

  void errorSignal(QNetworkReply* preply);

  void sleepSignal();

};

#endif // DOWNLOADINGTHREAD0_H

threads/dt0.cpp

#include "dt0.h"

/**
 * Reads the thread's settings and wires up the reply-handling slots.
 *
 * @param pdM data manager pointer, forwarded to DownloadingThread.
 */
DT0::DT0(DataManager* pdM)
  : DownloadingThread(pdM)
{
  RCCSettings *settings = RCCSettings::instance();

  _pdChunkSizeMultiplier =
      settings->value("DownloadingThreadPDChunkSizeMultiplier", "r").toInt();
  _sleepTime = settings->value("DownloadingThreadSleepTime", "r").toInt();
  _saveDuplicates = settings->value("SaveDuplicates", "r").toBool();

  // Every finished reply of the manager is routed to slotFinished().
  connect(pnam, SIGNAL(finished(QNetworkReply*)),
          this, SLOT(slotFinished(QNetworkReply*)));

  // Failures that are not retried are forwarded to slotError() through errorSignal().
  connect(this, SIGNAL(errorSignal(QNetworkReply*)),
          this, SLOT(slotError(QNetworkReply*)));
}

/// Destructor with an empty body.
DT0::~DT0() {}

void DT0::run()

{

  while(true)

  {

  if (stop)

  break;

  QCoreApplication::processEvents();


  QVector<HostData*> phosts = _pdM->getHosts();

  for (HostData *phD: phosts)

  {

  QMap<HostData*, HostDownloadData>::iterator hDDsIt;

  if (!_hDDs. contains(phD))

  {

  hDDsIt = _hDDs. insert(phD, HostDownloadData());

  _pdM->getFreePDsForDownloading(hDDsIt->pDs, phD,_pdChunkSizeMultiplier);

  hDDsIt->pDsIt = hDDsIt->pDs. begin();

  }

  else

  hDDsIt = _hDDs. find(phD);

  if (hDDsIt->pDs. end() == hDDsIt->pDsIt)

  {

  _pdM->getFreePDsForDownloading(hDDsIt->pDs, phD, _pdChunkSizeMultiplier);

  hDDsIt->pDsIt = hDDsIt->pDs. begin();

  }

  while (hDDsIt->curDownloadsCount < phD->maxDownloadsAtTime

  && hDDsIt->pDs. end() != hDDsIt->pDsIt)

  {

  hDDsIt->curDownloadsCount++;

  int downloadDelay = phD->crawlDelay;

  if (!downloadDelay)

  {

  _download(*hDDsIt->pDsIt);

  }

  else

  {

  DelayedPDPass *pdPDP = new DelayedPDPass(*hDDsIt->pDsIt, downloadDelay, this);

  connect(pdPDP, SIGNAL(pDIsReady(PageData)),

  this, SLOT(slotPDCameOnDownload(PageData)));

  }

  hDDsIt->pDsIt++;

  }

  }

  if (_outPDs. size() >= _pdChunkSizeMultiplier*2)

  {

  _pdM->updatePDs(_outPDs);

  _outPDs. clear();

  }

  else

  {

  bool noDataForSomeOfHosts = false;

  for (HostData *phD: phosts)

  {

  QMap<HostData*, HostDownloadData>::iterator hDDsIt = _hDDs. find(phD);

  if(hDDsIt->pDs. end() == hDDsIt->pDsIt)

  {

  noDataForSomeOfHosts = true;

  break;

  }

  }

  if (noDataForSomeOfHosts)

  {

  _pdM->updatePDs(_outPDs);

  _outPDs. clear();

  }

  }

  if (_sleepTime)

  msleep(_sleepTime);

  }

}

/**
 * Starts a GET request for a page and registers the reply.
 *
 * Increments pD.downloadAttempts, requests the host string joined with the
 * page's normalized URL, and stores a copy of @p pD under the new reply in
 * _replyPDLinks.
 *
 * @param pD page descriptor to download; its attempt counter is modified.
 */
void DT0::_download(PageData &pD)
{
  ++pD.downloadAttempts;

  const QString address = pD.phD->str + pD.normalizedUrl;
  const QUrl target(address);

  QNetworkRequest request(target);
  request.setRawHeader("User-Agent", RCCConsts::USER_AGENT);

  QNetworkReply *reply = pnam->get(request);
  _replyPDLinks.insert(reply, pD);

  // The progress notification is used to inspect the response early.
  connect(reply, SIGNAL(downloadProgress(qint64,qint64)),
          this, SLOT(slotDownloadProgress(qint64,qint64)));
}

void DT0::slotFinished(QNetworkReply *preply)

{

  PageData pD = _replyPDLinks. value(preply);

  QMap<HostData*, HostDownloadData>::iterator hDDsIt = _hDDs. find(pD. phD);

  hDDsIt->curDownloadsCount--;

  QNetworkReply::NetworkError error = preply->error();

  if (error!= QNetworkReply::NoError)

  {

  if ((error == QNetworkReply::ContentNotFoundError

  || error == QNetworkReply::ContentAccessDenied

  || error == QNetworkReply::ContentOperationNotPermittedError)

  || pD. downloadAttempts > RCCConsts::MAX_DOWNLOAD_ATTEMPTS)

  {

  emit errorSignal(preply);

  return;

  }

  else

  {

  _replyPDLinks. remove(preply);

  _outPDs. append(pD);

  switch(error)

  {

  case QNetworkReply::ServiceUnavailableError:

  _pdM->insertLogItem(DataManager::CommonLog,

  PDLogItem(

  QString("Download will be attempted again (Service anavailable). Was ")

  + QString::number(pD. downloadAttempts) + QString(" attempts: "), pD));

  break;

  }

  return;

  }

  }

  int statusCode = preply->attribute(QNetworkRequest::HttpStatusCodeAttribute).toInt();

  if(statusCode == 301 || statusCode == 302 || statusCode == 303)

  {

  _replyPDLinks. remove(preply);

  QUrl redirectUrl = preply->attribute(QNetworkRequest::RedirectionTargetAttribute).toUrl();

  preply->deleteLater();

  _pdM->insertLogItem(DataManager::CommonLog,

  PDLogItem("Redirected to "+redirectUrl. toString()+": ", pD));

  if (((pD. phD->host == redirectUrl. host() && pD. phD->protocol == redirectUrl. scheme())

  || redirectUrl. isRelative())

  && pD. downloadAttempts <= RCCConsts::MAX_DOWNLOAD_ATTEMPTS)

  {

  pD. downloadAttempts++;

  QString query = redirectUrl. query();

  QString redirectUrlStr = redirectUrl. path();

  if (query. length())

  redirectUrlStr += "?" + redirectUrl. query();

  pD. url = redirectUrlStr;

  pD. normalizedUrl = PageData::normalizeUrl(redirectUrlStr);

  _outPDs. append(pD);

  }

  else

  {

  pD. remove = true;

  _outPDs. append(pD);

  }

  return;

  }

  _replyPDLinks. remove(preply);

  _pdM->insertLogItem(DataManager::CommonLog, PDLogItem("Downloaded: ", pD));

  QString content = QString(preply->readAll());

  std::pair<bool, PageData> contentSeen = _pdM->contentSeen(pD. phD, content);

  if (!contentSeen. first)

  {

  pD. content = content;

  pD. contentHash = PageData::hashContent(pD. content);

  pD. downloaded = true;

  }

  else

  {

  pD. replaceId =contentSeen. second. id;

  if (_saveDuplicates)

  {

  pD. errorCode = RCCConsts::CONTENT_DUPLICATE;

  pD. downloaded = true;

  }

  else

  {

  pD. remove = true;

  }

  }

  _outPDs. append(pD);

  preply->deleteLater();

}

/**
 * Records a failed download.
 *
 * Removes the reply from _replyPDLinks, stores the reply's error code in
 * the page descriptor, marks it as downloaded, queues it in _outPDs,
 * writes the error text to the error log and schedules the reply for
 * deletion.
 *
 * @param preply reply that failed.
 */
void DT0::slotError(QNetworkReply *preply)
{
  PageData failedPage = _replyPDLinks.take(preply);

  failedPage.errorCode = preply->error();
  failedPage.downloaded = true;
  _outPDs.append(failedPage);

  const PDLogItem logEntry(preply->errorString(), failedPage);
  _pdM->insertLogItem(DataManager::ErrorLog, logEntry);

  preply->deleteLater();
}

/**
 * Inspects a reply on its first progress notification.
 *
 * The slot disconnects itself from the reply immediately, so it runs at
 * most once per reply. For an HTTP 200 response whose Content-type is
 * neither text/html nor text/plain, the reply is dropped: the host's active
 * download counter is decremented, the reply is untracked, disconnected
 * and scheduled for deletion, and the page is queued in _outPDs flagged
 * for removal.
 *
 * @param bytesReceived unused.
 * @param bytesTotal    unused.
 */
void DT0::slotDownloadProgress(qint64 bytesReceived, qint64 bytesTotal)
{
  Q_UNUSED(bytesReceived);
  Q_UNUSED(bytesTotal);

  // sender() is null when the slot is called directly, and need not be a reply.
  QNetworkReply *preply = qobject_cast<QNetworkReply*>(sender());
  if (!preply)
    return;

  disconnect(preply, SIGNAL(downloadProgress(qint64,qint64)),
             this, SLOT(slotDownloadProgress(qint64,qint64)));

  const int statusCode = preply->attribute(QNetworkRequest::HttpStatusCodeAttribute).toInt();
  if (statusCode != 200)
    return;

  // Without a tracked page descriptor there is nothing to update.
  if (!_replyPDLinks.contains(preply))
    return;

  const QString contentType(preply->rawHeader("Content-type"));
  const bool isText =
      contentType.indexOf("text/html", Qt::CaseInsensitive) != -1
      || contentType.indexOf("text/plain", Qt::CaseInsensitive) != -1;
  if (isText)
    return;

  PageData pD = _replyPDLinks.value(preply);

  QMap<HostData*, HostDownloadData>::iterator hDDsIt = _hDDs.find(pD.phD);
  if (hDDsIt != _hDDs.end())
    hDDsIt->curDownloadsCount--;

  _replyPDLinks.remove(preply);

  // Drop every connection of the reply before releasing it.
  preply->disconnect();
  preply->deleteLater();

  pD.remove = true;
  _outPDs.append(pD);
}

/**
 * Starts the download of a page descriptor delivered through the
 * pDIsReady(PageData) signal of a DelayedPDPass.
 *
 * @param pD page descriptor to download (received by value).
 */
void DT0::slotPDCameOnDownload(PageData pD)
{
  PageData &readyPage = pD;
  _download(readyPage);
}

threads/parsingthread.h

#ifndef PARSINGTHREAD_H

#define PARSINGTHREAD_H

#include "includes. h"

#include "rccbasethread. h"

class ParsingThread : public RCCBaseThread

{

  Q_OBJECT

public:

Из-за большого объема этот материал размещен на нескольких страницах:
1 2 3 4 5 6 7 8 9