ParsingThread(DataManager* pdM);

  virtual ~ParsingThread();

};

#endif // PARSINGTHREAD_H

threads/parsingthread.cpp

#include "parsingthread. h"

/// Builds the parsing-thread base object.
/// @param pdM data manager pointer, forwarded unchanged to RCCBaseThread.
ParsingThread::ParsingThread(DataManager* pdM)
    : RCCBaseThread(pdM)
{}

/// Nothing is owned at this level, so the destructor body is empty.
ParsingThread::~ParsingThread() = default;

threads/pt0.h

#ifndef PARSINGTHREAD0_H

#define PARSINGTHREAD0_H

#include "parsingthread. h"

#include "helpercreator. h"

class PT0 : public ParsingThread

{

public:

  PT0(DataManager* pdM);

  virtual ~PT0();

protected:

  virtual void run();

  void _processPageData(PageData& pD, PDPAContainer &pDPAs) const;

  void _extractUrls(QVector<QString> &hrefs, const QString &content, const int &bodyPos) const;

  bool _isResource(const QString& url) const;

  QString _getBaseUrl(const QString &content) const;

  QVector<std::pair<int, int>> _getForbiddenZones(const QString &content, const int &bodyPos) const;

  void _manageForbiddenZones(int &pos, const QVector<std::pair<int, int>> &forbiddenZones) const;

  QVector<QString> _resourceExtensions;

  int _pdChunkSize;

  RobotsTxt *_prTxt;

};

#endif // PARSINGTHREAD0_H

threads/pt0.cpp

#include "pt0.h"

/// Creates the parsing thread and reads its configuration.
///
/// Fills the list of extensions treated as non-page resources, then loads
/// the page chunk size, the idle sleep time and the robots.txt helper class
/// name from RCCSettings.
/// @param pdM data manager, forwarded to ParsingThread.
PT0::PT0(DataManager* pdM):
    ParsingThread(pdM),
    // Each extension is listed once; all entries are lower-case because
    // _isResource() compares them against the lower-cased path suffix.
    _resourceExtensions({
        ".jpg",
        ".jpeg",
        ".js",
        ".ico",
        ".css",
        ".png",
        ".pdf",
        ".rar",
        ".zip",
        ".doc",
        ".docx",
        ".xls",
        ".xlsx",
        ".mp3",
        ".djvu",
        ".rtf",
        ".ppt",
        ".txt",
        ".pptx",
        ".gz",
        ".gif",
        ".xml",
        ".tif",
        ".tiff",
        ".flv",
        ".avi",
        ".mkv",
        ".flac",
        ".ogg",
        ".mp4",
        ".exe",
        ".msi",
        ".deb",
        ".zip.001",
        ".zip.002",
        ".svg",
        ".odt",
        ".7z",
        ".ppsx"
    })
{
    RCCSettings *psett = RCCSettings::instance();

    // NOTE(review): the meaning of the second argument ("r") is defined by
    // RCCSettings::value(), which is not visible here - confirm.
    _pdChunkSize = psett->value("ParsingThreadPDChunkSize", "r").toInt();
    _sleepTime = psett->value("ParsingThreadSleepTime", "r").toInt();

    QString robotsTxtClass = psett->value("RobotsTxtClass", "r").toString();
    // The returned helper is deleted in ~PT0().
    _prTxt = HelperCreator::robotsTxt(robotsTxtClass);
}

/// Releases the robots.txt helper obtained from HelperCreator in the
/// constructor.
PT0::~PT0()
{
    delete _prTxt;
}

void PT0::run()

{

  PDContainer pDs;

  PDPAContainer pDPAs;

  while (true)

  {

  if (stop)

  break;

  _pdM->getFreePDsForParsing(pDs,_pdChunkSize);

  if (!pDs. count())

  {

  msleep(_sleepTime);

  continue;

  }

  pDPAs. reserve(RCCConsts::APPROXIMATE_HREFS_PER_PAGE * pDs. count());

  for (PageData& pD: pDs)

  {

  _processPageData(pD, pDPAs);

  }

  _pdM->addPDsAndPAs(pDPAs);

  _pdM->updatePDs(pDs);

  pDPAs. clear();

  }

}

/// Parses one page.
///
/// Extracts the links of `pD.content` (searching from the <body> tag when
/// one is found), empties the content, and then for every valid,
/// non-resource link either appends it to `pDPAs` (link without host or with
/// the page's host, and allowed by the robots rules) or increments
/// `pD.outDegree` (any other host). Finally sets `pD.parsed`.
void PT0::_processPageData(PageData &pD, PDPAContainer &pDPAs) const
{
    QString &html = pD.content;

    // Start scanning at <body>; fall back to the beginning of the document.
    int bodyPos = html.indexOf("<body");
    if (bodyPos == -1)
        bodyPos = html.indexOf("<BODY");
    if (bodyPos == -1)
        bodyPos = 0;

    QVector<QString> links;
    links.reserve(RCCConsts::APPROXIMATE_HREFS_PER_PAGE);
    _extractUrls(links, html, bodyPos);

    // The raw page text is not needed once the links are extracted.
    pD.content = "";

    for (QString &link : links)
    {
        QUrl url = QUrl(link).adjusted(QUrl::RemoveFragment);
        if (!url.isValid() || _isResource(url.toString()))
            continue;

        // A relative link that does not start with a slash is resolved
        // against the directory of the current page.
        if (url.isRelative() && link.at(0) != '/')
        {
            QUrl pageDir = QUrl(pD.normalizedUrl).adjusted(QUrl::RemoveFilename);
            link = pageDir.path() + link;
            url.setUrl(pD.phD->str + link);
        }

        const bool hostless = url.host() == "" && url.isRelative();
        if (!(hostless || url.host() == pD.phD->host))
        {
            // Link to a different host: only counted.
            pD.outDegree++;
            continue;
        }

        QString normalizedUrl = PageData::normalizeUrl(link);
        if (_prTxt->allowUrl(normalizedUrl, pD.phD->rules))
            pDPAs.append(PDPACreateData(link, normalizedUrl, pD));
    }

    pD.parsed = true;
}

/// Collects the values of all " href" attributes found in `content`.
///
/// Scanning starts at `bodyPos`. Matches located inside comments, <style>
/// or <script> sections are skipped. A value may be enclosed in single or
/// double quotes; an unquoted value ends at the first space or '>'.
/// Relative values that do not start with '/' are prefixed with the page's
/// <base> URL (possibly empty).
///
/// Every index is checked against the content length, so a truncated or
/// malformed document (e.g. ending with ` href=` or with an unterminated
/// quote) ends the scan instead of reading past the end of the string.
///
/// @param hrefs   output container; extracted links are appended to it.
/// @param content page text.
/// @param bodyPos offset at which the scan starts.
void PT0::_extractUrls(QVector<QString> &hrefs,  const QString &content, const int &bodyPos) const
{
    const QString base = _getBaseUrl(content);
    const QVector<std::pair<int, int>> forbiddenZones = _getForbiddenZones(content, bodyPos);
    const int contentLength = content.length();

    int hrefPos = bodyPos;
    while ((hrefPos = content.indexOf(" href", hrefPos, Qt::CaseInsensitive)) != -1)
    {
        // A match inside a forbidden zone is skipped: the search resumes
        // from the end of that zone.
        const int matchPos = hrefPos;
        _manageForbiddenZones(hrefPos, forbiddenZones);
        if (matchPos != hrefPos)
            continue;

        // Find the beginning of the value and its terminating symbol.
        hrefPos += 5; // length of " href"

        bool foundEqual = false;
        while (hrefPos < contentLength
               && (content.at(hrefPos) == ' ' || content.at(hrefPos) == '='))
        {
            if (content.at(hrefPos) == '=')
                foundEqual = true;
            ++hrefPos;
        }

        if (hrefPos >= contentLength)
            break; // document ends right after the attribute name

        if (!foundEqual)
            continue; // " href" was not an attribute assignment

        QChar closingSymbol(' ');
        if (content.at(hrefPos) == '\'' || content.at(hrefPos) == '"')
        {
            closingSymbol = content.at(hrefPos);
            ++hrefPos;
        }

        // Find the end of the value.
        int hrefEnd = hrefPos;
        while (hrefEnd < contentLength
               && content.at(hrefEnd) != closingSymbol
               && (closingSymbol != ' ' || content.at(hrefEnd) != '>'))
        {
            ++hrefEnd;
        }

        if (hrefEnd >= contentLength)
            break; // unterminated value: nothing reliable left to extract

        QString href = content.mid(hrefPos, hrefEnd - hrefPos).trimmed();
        if (!href.isEmpty())
        {
            if (QUrl(href).isRelative() && href.at(0) != '/')
                href = base + href;

            hrefs.append(href);
        }

        hrefPos = hrefEnd;
    }
}

/// Returns the base URL declared by the page's <base href="..."> tag with
/// the file name removed, or an empty string when no usable value exists.
///
/// Fixes over the previous version:
///  - the tag text is cut with a length (QString::mid takes a length, not an
///    end index), so an href located after the tag is no longer picked up;
///  - a missing "href" is detected before the offset is advanced;
///  - all scans are bounded by the tag length;
///  - an unquoted value also ends at '>'.
///
/// @param content page text.
QString PT0::_getBaseUrl(const QString &content) const
{
    QString base("");

    const int baseStart = content.indexOf("<base");
    if (baseStart == -1)
        return base;

    const int baseEnd = content.indexOf(">", baseStart);
    if (baseEnd != -1)
    {
        // Text of the tag itself, including the closing '>'.
        const QString baseTag = content.mid(baseStart, baseEnd - baseStart + 1);
        const int tagLength = baseTag.length();

        int valuePos = baseTag.indexOf("href");
        if (valuePos != -1)
        {
            valuePos += 4; // length of "href"

            // Skip the '=' and any spaces around it.
            while (valuePos < tagLength
                   && (baseTag.at(valuePos) == ' ' || baseTag.at(valuePos) == '='))
            {
                ++valuePos;
            }

            if (valuePos < tagLength)
            {
                QChar closingSymbol(' ');
                const bool quoted = baseTag.at(valuePos) == '\'' || baseTag.at(valuePos) == '"';
                if (quoted)
                {
                    closingSymbol = baseTag.at(valuePos);
                    ++valuePos;
                }

                // Find the end of the value.
                int valueEnd = valuePos;
                while (valueEnd < tagLength
                       && baseTag.at(valueEnd) != closingSymbol
                       && (quoted || baseTag.at(valueEnd) != '>'))
                {
                    ++valueEnd;
                }

                // A quoted value without its closing quote is malformed
                // and is ignored.
                if (!quoted || valueEnd < tagLength)
                    base = baseTag.mid(valuePos, valueEnd - valuePos);
            }
        }
    }

    QUrl u(base);
    u = u.adjusted(QUrl::RemoveFilename);
    base = u.toString();

    return base;
}

/// Locates the parts of `content` in which links must not be extracted:
/// HTML comments, <style> sections and <script> sections, searched from
/// `bodyPos` on.
///
/// Each zone is a (start, end) pair where `start` is the offset of the
/// opening marker and `end` is the offset just past the closing marker. A
/// zone whose closing marker is missing extends to the end of the content.
/// The closing-marker length is taken from the marker itself (the previous
/// code used 8 for the 9-character "</script>").
///
/// @param content page text.
/// @param bodyPos offset at which the search starts.
/// @return zones in the order comments, styles, scripts.
QVector<std::pair<int, int> > PT0::_getForbiddenZones(const QString &content,
                                                      const int &bodyPos) const
{
    QVector<std::pair<int, int>> result;
    const int contentLength = content.length();

    // Appends one zone per openTag...closeTag section. An opening marker
    // located inside an already recorded zone is skipped.
    auto collectZones = [&](const QString &openTag, const QString &closeTag)
    {
        int start = bodyPos;
        while ((start = content.indexOf(openTag, start, Qt::CaseInsensitive)) != -1)
        {
            const int found = start;
            _manageForbiddenZones(start, result);
            if (start != found)
                continue; // inside an earlier zone; resume after it

            const int closePos = content.indexOf(closeTag, start, Qt::CaseInsensitive);
            const int zoneEnd = (closePos == -1) ? contentLength
                                                 : closePos + closeTag.length();

            result.append(std::make_pair(start, zoneEnd));
            start = zoneEnd;
        }
    };

    collectZones(QString("<!--"), QString("-->"));          // comments
    collectZones(QString("<style"), QString("</style>"));   // style
    collectZones(QString("<script"), QString("</script>")); // script

    return result;
}

void PT0::_manageForbiddenZones(int &pos, const QVector<std::pair<int, int>> &forbiddenZones) const

{

  for (const std::pair<int, int> &zone: forbiddenZones)

  {

  if (pos > zone. first && pos < zone. second)

  {

  pos = zone. second;

  break;

  }

  }

}

bool PT0::_isResource(const QString &url) const

{

  QUrl u(url);

  QString forCheck = u. path();

  for (const QString &ext: _resourceExtensions)

  {

  if (forCheck. right(ext. length()).toLower() == ext)

  return true;

  }

  return false;

}

threads/rccbasethread.h

#ifndef RCCBASETHREAD_H

#define RCCBASETHREAD_H

#include "includes. h"

#include "rccconsts. h"

#include "data_managers/datamanager. h"

class RCCBaseThread: public QThread

{

  Q_OBJECT

public:

  RCCBaseThread(DataManager* pdM);

  void msleep(unsigned long msecs);

  bool stop;

protected:

  DataManager *_pdM;

  int _sleepTime;

};

#endif // RCCBASETHREAD_H

threads/rccbasethread.cpp

#include "rccbasethread. h"

/// Stores the data manager, sets the default sleep interval and clears the
/// stop flag.
///
/// The initializers are written in the order in which the members are
/// declared in the class (stop, _pdM, _sleepTime); members are always
/// initialized in declaration order, and matching it avoids -Wreorder.
/// @param pdM data manager pointer kept in `_pdM`.
RCCBaseThread::RCCBaseThread(DataManager* pdM):
    stop(false),
    _pdM(pdM),
    _sleepTime(RCCConsts::THREAD_SLEEP_TIME_MS)
{
}

void RCCBaseThread::msleep(unsigned long msecs)

{

  QThread::msleep(msecs);

}

threads/routinethread.h

#ifndef ROUTINETHREAD_H

#define ROUTINETHREAD_H

#include "includes. h"

#include "result_unloaders/testru. h"

#include "rccbasethread. h"

class RoutineThread : public RCCBaseThread

{

  Q_OBJECT

public:

  RoutineThread(DataManager* pdM);

  virtual ~RoutineThread();

};

#endif // ROUTINETHREAD_H

threads/routinethread.cpp

#include "routinethread. h"

/// Builds the routine-thread base object.
/// @param pdM data manager pointer, forwarded unchanged to RCCBaseThread.
RoutineThread::RoutineThread(DataManager* pdM)
    : RCCBaseThread(pdM)
{}

/// Nothing is owned at this level, so the destructor body is empty.
RoutineThread::~RoutineThread() = default;

threads/rt0.h

#ifndef RT0_H

#define RT0_H

#include "routinethread. h"

class RT0 : public RoutineThread

{

  Q_OBJECT

public:

  RT0(DataManager* pdM);

  virtual ~RT0();

protected:

  virtual void run();

  void _log();

  void _logCommon();

  void _logErrors();

  QFile _cLFile;

  QFile _eLFile;

  bool _displayCL;

  bool _displayEL;

  QTextStream _cLTS;

  QTextStream _eLTS;

};

#endif // RT0_H

threads/rt0.cpp

#include "rt0.h"

/// Reads the routine-thread settings and opens both log files for writing.
///
/// A log file that cannot be opened is reported with qWarning() instead of
/// being ignored silently; construction continues either way, as before.
/// @param pdM data manager, forwarded to RoutineThread.
RT0::RT0(DataManager* pdM):
    RoutineThread(pdM)
{
    RCCSettings *psett = RCCSettings::instance();

    // NOTE(review): the meaning of the second argument ("r") is defined by
    // RCCSettings::value(), which is not visible here - confirm.
    _sleepTime = psett->value("RoutineThreadSleepTime", "r").toInt();

    const QString commonLogFileName = psett->value("CommonLogFile", "r").toString();
    _cLFile.setFileName(commonLogFileName);
    if (!_cLFile.open(QIODevice::WriteOnly))
    {
        qWarning() << "Cannot open common log file" << commonLogFileName
                   << ":" << _cLFile.errorString();
    }
    _cLTS.setDevice(&_cLFile);

    const QString errorLogFileName = psett->value("ErrorLogFile", "r").toString();
    _eLFile.setFileName(errorLogFileName);
    if (!_eLFile.open(QIODevice::WriteOnly))
    {
        qWarning() << "Cannot open error log file" << errorLogFileName
                   << ":" << _eLFile.errorString();
    }
    _eLTS.setDevice(&_eLFile);

    _displayCL = psett->value("DisplayCommonLog", "r").toBool();
    _displayEL = psett->value("DisplayErrorLog", "r").toBool();
}

/// Flushes the text streams and closes both log files.
///
/// The streams are destroyed after this body runs, i.e. after the files are
/// closed, so anything still buffered in them is flushed explicitly first.
RT0::~RT0()
{
    _cLTS.flush();
    _cLFile.close();

    _eLTS.flush();
    _eLFile.close();
}

void RT0::run()

{

  while (true)

  {

  if (stop)

  {

  _displayCL = false;

  _displayEL = false;

  }

  _log();

  if (stop)

  break;

  msleep(_sleepTime);

  }

}

void RT0::_log()

{

  _logCommon();

  _logErrors();

}

void RT0::_logCommon()

{

  PDLIContainer *ppDLIs = _pdM->getLog(DataManager::CommonLog);

  while (!ppDLIs->isEmpty())

  {

  PDLogItem pDLI = ppDLIs->takeFirst();

  QString pDStr = pDLI. pD. id? pDLI. pD. toString() : "";

  _cLTS << """" + pDLI. message + """;" + pDStr << endl;

  if (_displayCL)

  {

  qDebug().noquote() << pDLI. message + pDStr << endl;

  }

  }

  delete ppDLIs;

}

void RT0::_logErrors()

{

  PDLIContainer *ppDLIs = _pdM->getLog(DataManager::ErrorLog);

  while (!ppDLIs->isEmpty())

  {

  PDLogItem pDLI = ppDLIs->takeFirst();

  QString pDStr = pDLI. pD. toString();

  _eLTS << """" + pDLI. message + """;" + pDStr << endl;

  if (_displayEL)

  {

  qDebug().noquote() << "Error: " + pDLI. message + ";" + pDStr << endl;

  }

  }

  delete ppDLIs;

}

Из-за большого объема этот материал размещен на нескольких страницах:
1 2 3 4 5 6 7 8 9