ParsingThread(DataManager* pdM);
virtual ~ParsingThread();
};
#endif // PARSINGTHREAD_H
threads/parsingthread.cpp
#include "parsingthread. h"
/// Constructs a parsing thread; the data manager pointer is simply forwarded
/// to the RCCBaseThread constructor.
ParsingThread::ParsingThread(DataManager* pdM)
    : RCCBaseThread(pdM)
{}
/// Nothing to release at this level of the hierarchy.
ParsingThread::~ParsingThread() = default;
threads/pt0.h
#ifndef PARSINGTHREAD0_H
#define PARSINGTHREAD0_H
#include "parsingthread. h"
#include "helpercreator. h"
// Concrete parsing thread. Its run() loop pulls PageData items from the
// DataManager, extracts href attributes from each page's content and hands
// the resulting links back to the DataManager.
class PT0 : public ParsingThread
{
public:
// Reads its settings from RCCSettings and creates the robots.txt helper.
PT0(DataManager* pdM);
// Deletes the robots.txt helper created in the constructor.
virtual ~PT0();
protected:
// Thread body: loops until the inherited `stop` flag becomes true.
virtual void run();
// Extracts the links of one page, appends the accepted same-host links to
// pDPAs, counts the other links in pD.outDegree and marks pD as parsed.
void _processPageData(PageData& pD, PDPAContainer &pDPAs) const;
// Appends to `hrefs` every href attribute value found in `content` at or
// after `bodyPos`, skipping the zones returned by _getForbiddenZones().
void _extractUrls(QVector<QString> &hrefs, const QString &content, const int &bodyPos) const;
// True when the path of `url` ends with one of _resourceExtensions
// (compared in lower case).
bool _isResource(const QString& url) const;
// Returns the href of the page's <base> tag with the file name removed,
// or an empty string when the page has no <base tag.
QString _getBaseUrl(const QString &content) const;
// Returns (start, end) position pairs covering HTML comments, <style> and
// <script> sections found at or after `bodyPos`.
QVector<std::pair<int, int>> _getForbiddenZones(const QString &content, const int &bodyPos) const;
// If `pos` lies strictly inside one of the zones, moves it to that zone's end.
void _manageForbiddenZones(int &pos, const QVector<std::pair<int, int>> &forbiddenZones) const;
// Lower-case file suffixes (e.g. ".jpg") of links that are not parsed as pages.
QVector<QString> _resourceExtensions;
// Value of the "ParsingThreadPDChunkSize" setting; passed to
// DataManager::getFreePDsForParsing() on every loop iteration.
int _pdChunkSize;
// robots.txt helper created by HelperCreator::robotsTxt(); deleted in ~PT0().
RobotsTxt *_prTxt;
};
#endif // PARSINGTHREAD0_H
threads/pt0.cpp
#include "pt0.h"
/**
 * Creates the parsing thread.
 *
 * Fills the list of file suffixes that mark a link as a non-page resource,
 * then reads the chunk size, the idle sleep time and the robots.txt helper
 * class name from RCCSettings.
 *
 * @param pdM data manager, forwarded to the ParsingThread base class.
 */
PT0::PT0(DataManager* pdM):
    ParsingThread(pdM),
    // Every suffix is listed exactly once: the list is only used for
    // membership checks, so repeated entries would just cost extra comparisons.
    _resourceExtensions({
        ".jpg", ".jpeg", ".js", ".ico", ".css", ".png", ".pdf", ".rar",
        ".zip", ".doc", ".docx", ".xls", ".xlsx", ".mp3", ".djvu", ".rtf",
        ".ppt", ".txt", ".pptx", ".gz", ".gif", ".xml", ".tif", ".tiff",
        ".flv", ".avi", ".mkv", ".flac", ".ogg", ".mp4", ".exe", ".msi",
        ".deb", ".zip.001", ".zip.002", ".svg", ".odt", ".7z", ".ppsx"
    }),
    // Defined values until the settings below are read.
    _pdChunkSize(0),
    _prTxt(nullptr)
{
    RCCSettings *psett = RCCSettings::instance();
    _pdChunkSize = psett->value("ParsingThreadPDChunkSize", "r").toInt();
    _sleepTime = psett->value("ParsingThreadSleepTime", "r").toInt();

    // The concrete robots.txt helper is chosen by name through the settings;
    // it is released in the destructor.
    const QString robotsTxtClass = psett->value("RobotsTxtClass", "r").toString();
    _prTxt = HelperCreator::robotsTxt(robotsTxtClass);
}
/// Releases the robots.txt helper obtained from HelperCreator in the constructor.
PT0::~PT0()
{
    delete _prTxt;
}
void PT0::run()
{
PDContainer pDs;
PDPAContainer pDPAs;
while (true)
{
if (stop)
break;
_pdM->getFreePDsForParsing(pDs,_pdChunkSize);
if (!pDs. count())
{
msleep(_sleepTime);
continue;
}
pDPAs. reserve(RCCConsts::APPROXIMATE_HREFS_PER_PAGE * pDs. count());
for (PageData& pD: pDs)
{
_processPageData(pD, pDPAs);
}
_pdM->addPDsAndPAs(pDPAs);
_pdM->updatePDs(pDs);
pDPAs. clear();
}
}
/**
 * Extracts the links of a single page.
 *
 * The page content is scanned from its <body tag (or from the beginning when
 * there is none). Every extracted href that is a valid URL and not a resource
 * file is classified:
 *   - links without a host, or with the host of the page's own site, are
 *     normalized and, if robots.txt rules allow them, appended to pDPAs;
 *   - all other links only increase pD.outDegree.
 * Afterwards the page content is emptied and the page is marked as parsed.
 *
 * @param pD    page being processed; content, outDegree and parsed are modified.
 * @param pDPAs output container receiving the accepted links.
 */
void PT0::_processPageData(PageData &pD, PDPAContainer &pDPAs) const
{
    // Case-insensitive search so that "<Body" is recognised as well as
    // "<body" and "<BODY".
    int bodyPos = pD.content.indexOf("<body", 0, Qt::CaseInsensitive);
    if (bodyPos == -1)
        bodyPos = 0;

    QVector<QString> hrefs;
    hrefs.reserve(RCCConsts::APPROXIMATE_HREFS_PER_PAGE);
    _extractUrls(hrefs, pD.content, bodyPos);

    // The raw content is not needed once the hrefs are extracted.
    pD.content = "";

    for (QString &href : hrefs)
    {
        QUrl u(href);
        u = u.adjusted(QUrl::RemoveFragment);
        const QString hrefNoFragment = u.toString();
        if (!u.isValid() || _isResource(hrefNoFragment))
            continue;

        // A relative link that does not start with a slash is resolved
        // against the directory of the current page.
        if (u.isRelative() && !href.isEmpty() && href.at(0) != '/')
        {
            QUrl pDUrl(pD.normalizedUrl);
            pDUrl = pDUrl.adjusted(QUrl::RemoveFilename);
            href = pDUrl.path() + href;
            u.setUrl(pD.phD->str + href);
        }

        const bool sameSite = (u.host() == "" && u.isRelative())
                              || u.host() == pD.phD->host;
        if (sameSite)
        {
            const QString normalizedUrl = PageData::normalizeUrl(href);
            if (_prTxt->allowUrl(normalizedUrl, pD.phD->rules))
                pDPAs.append(PDPACreateData(href, normalizedUrl, pD));
        }
        else
        {
            pD.outDegree++;
        }
    }
    pD.parsed = true;
}
/**
 * Collects the values of all href attributes found in `content`.
 *
 * The scan starts at `bodyPos` and ignores matches inside the zones returned
 * by _getForbiddenZones() (comments, <style>, <script>). A value may be
 * enclosed in single or double quotes; an unquoted value ends at the first
 * space or '>'. Relative values that do not start with '/' are prefixed with
 * the page's <base> URL.
 *
 * Every character access is bounds-checked, so truncated markup (content that
 * ends right after "href", or a quote that is never closed) stops the scan
 * instead of reading past the end of the string.
 *
 * @param hrefs   output list; extracted values are appended.
 * @param content page text.
 * @param bodyPos position at which the scan starts.
 */
void PT0::_extractUrls(QVector<QString> &hrefs, const QString &content, const int &bodyPos) const
{
    const QString base = _getBaseUrl(content);
    const QVector<std::pair<int, int>> forbiddenZones = _getForbiddenZones(content, bodyPos);
    const int contentLength = content.length();

    int hrefPos = bodyPos;
    while ((hrefPos = content.indexOf(" href", hrefPos, Qt::CaseInsensitive)) != -1)
    {
        // A match inside a forbidden zone is skipped: the search resumes at
        // the end of that zone.
        const int foundPos = hrefPos;
        _manageForbiddenZones(hrefPos, forbiddenZones);
        if (foundPos != hrefPos)
            continue;

        // Locate the start of the value and its delimiter.
        hrefPos += 5; // length of " href"
        bool foundEqual = false;
        while (hrefPos < contentLength
               && (content.at(hrefPos) == ' ' || content.at(hrefPos) == '='))
        {
            if (content.at(hrefPos) == '=')
                foundEqual = true;
            hrefPos++;
        }
        if (hrefPos >= contentLength)
            break; // content ends inside the attribute
        if (!foundEqual)
            continue; // the word "href" was not an attribute assignment

        QChar closingSymbol(' ');
        if (content.at(hrefPos) == '\'' || content.at(hrefPos) == '"')
        {
            closingSymbol = content.at(hrefPos);
            hrefPos++;
        }

        // Find the end of the value: the closing delimiter, or '>' when the
        // value is unquoted.
        int hrefEnd = hrefPos;
        while (hrefEnd < contentLength
               && content.at(hrefEnd) != closingSymbol
               && (closingSymbol != ' ' || content.at(hrefEnd) != '>'))
        {
            hrefEnd++;
        }
        if (hrefEnd >= contentLength && closingSymbol != ' ')
            break; // unterminated quote: the rest of the page is not a URL

        QString href = content.mid(hrefPos, hrefEnd - hrefPos).trimmed();
        if (href.length())
        {
            QUrl u(href);
            if (u.isRelative() && href.at(0) != '/')
                href = base + href;
            hrefs.append(href);
        }
        hrefPos = hrefEnd;
    }
}
/**
 * Returns the base URL declared by the page's <base href="..."> tag.
 *
 * The href value may be quoted with single or double quotes or be unquoted
 * (then it ends at the first space or at the end of the tag). The file name
 * component is removed from the result.
 *
 * @param content page text.
 * @return the base URL without file name, or an empty string when the page
 *         has no <base tag.
 */
QString PT0::_getBaseUrl(const QString &content) const
{
    QString base("");

    const int baseStart = content.indexOf("<base", 0, Qt::CaseInsensitive);
    if (baseStart == -1)
        return base;

    const int baseEnd = content.indexOf(">", baseStart);
    if (baseEnd != -1)
    {
        // mid() takes a length, not an end position: restrict the search to
        // the tag itself so that an href further down the page is never used.
        const QString baseTag = content.mid(baseStart, baseEnd - baseStart);
        const int tagLength = baseTag.length();

        int valueStart = baseTag.indexOf("href", 0, Qt::CaseInsensitive);
        if (valueStart != -1) // checked before the offset is applied
        {
            valueStart += 4; // length of "href"
            while (valueStart < tagLength
                   && (baseTag.at(valueStart) == ' ' || baseTag.at(valueStart) == '='))
            {
                valueStart++;
            }

            QChar closingSymbol(' ');
            if (valueStart < tagLength
                && (baseTag.at(valueStart) == '\'' || baseTag.at(valueStart) == '"'))
            {
                closingSymbol = baseTag.at(valueStart);
                valueStart++;
            }

            // Find the end of the value; the end of the tag also terminates it.
            int valueEnd = valueStart;
            while (valueEnd < tagLength && baseTag.at(valueEnd) != closingSymbol)
                valueEnd++;

            base = baseTag.mid(valueStart, valueEnd - valueStart);
        }
    }

    QUrl u(base);
    u = u.adjusted(QUrl::RemoveFilename);
    base = u.toString();
    return base;
}
/**
 * Finds the parts of the page in which href matches must be ignored:
 * HTML comments, <style> sections and <script> sections.
 *
 * Comments are collected first; a <style or <script opening that lies inside
 * an already collected zone is skipped. A section without a closing tag
 * extends to the end of the content.
 *
 * @param content page text.
 * @param bodyPos position at which the search starts.
 * @return list of (start, end) positions, `end` being the position just past
 *         the closing tag.
 */
QVector<std::pair<int, int> > PT0::_getForbiddenZones(const QString &content,
                                                      const int &bodyPos) const
{
    QVector<std::pair<int, int>> result;

    // One scan per kind of section. The zone end is derived from the actual
    // length of the closing tag, so "</script>" (9 characters) is covered
    // completely.
    auto collect = [&](const QString &openTag, const QString &closeTag,
                       Qt::CaseSensitivity cs)
    {
        int start = bodyPos;
        while ((start = content.indexOf(openTag, start, cs)) != -1)
        {
            // Skip openings that are inside an already known zone; the
            // search then resumes at the end of that zone.
            const int found = start;
            _manageForbiddenZones(start, result);
            if (found != start)
                continue;

            int end = content.indexOf(closeTag, start, cs);
            if (end == -1)
                end = content.length() - 1; // unterminated: zone runs to the end

            const int zoneEnd = end + closeTag.length();
            result.append(std::make_pair(start, zoneEnd));
            start = zoneEnd;
        }
    };

    collect("<!--", "-->", Qt::CaseSensitive);
    collect("<style", "</style>", Qt::CaseInsensitive);
    collect("<script", "</script>", Qt::CaseInsensitive);

    return result;
}
/**
 * Moves `pos` out of a forbidden zone.
 *
 * If `pos` lies strictly between the start and the end of one of the zones,
 * it is set to that zone's end; only the first matching zone is applied.
 * Otherwise `pos` is left unchanged.
 *
 * @param pos            position to check; updated in place.
 * @param forbiddenZones list of (start, end) positions.
 */
void PT0::_manageForbiddenZones(int &pos, const QVector<std::pair<int, int>> &forbiddenZones) const
{
    for (auto it = forbiddenZones.constBegin(); it != forbiddenZones.constEnd(); ++it)
    {
        const bool inside = it->first < pos && pos < it->second;
        if (!inside)
            continue;

        pos = it->second;
        return;
    }
}
/**
 * Tells whether a URL points to a resource file rather than to a page.
 *
 * @param url URL to check.
 * @return true when the path component of `url` ends, ignoring case, with
 *         one of the suffixes in _resourceExtensions.
 */
bool PT0::_isResource(const QString &url) const
{
    const QString path = QUrl(url).path();
    for (const QString &ext : _resourceExtensions)
    {
        // endsWith() compares in place; no temporary substring or lower-cased
        // copy is created for every extension.
        if (path.endsWith(ext, Qt::CaseInsensitive))
            return true;
    }
    return false;
}
threads/rccbasethread.h
#ifndef RCCBASETHREAD_H
#define RCCBASETHREAD_H
#include "includes. h"
#include "rccconsts. h"
#include "data_managers/datamanager. h"
// Common base of the worker threads in this directory: stores the data
// manager pointer, an idle sleep time and a stop flag polled by run() loops.
class RCCBaseThread: public QThread
{
Q_OBJECT
public:
// Stores pdM; the sleep time starts at RCCConsts::THREAD_SLEEP_TIME_MS and
// `stop` starts as false.
RCCBaseThread(DataManager* pdM);
// Public wrapper that forwards to QThread::msleep().
void msleep(unsigned long msecs);
// Polled by the run() loops of derived classes, which leave their loop once
// it is true.
// NOTE(review): plain bool; if it is written from another thread while
// run() is executing, consider std::atomic<bool> - confirm against callers.
bool stop;
protected:
// Data manager the thread works with. Nothing in this class deletes it.
DataManager *_pdM;
// Sleep duration passed to msleep() by derived classes (milliseconds).
int _sleepTime;
};
#endif // RCCBASETHREAD_H
threads/rccbasethread.cpp
#include "rccbasethread. h"
/**
 * Stores the data manager and sets the defaults shared by all worker threads:
 * the stop flag is cleared and the sleep time is taken from
 * RCCConsts::THREAD_SLEEP_TIME_MS.
 *
 * The initializers are written in member declaration order (stop, _pdM,
 * _sleepTime), which is the order in which they are actually executed; this
 * avoids the compiler's -Wreorder warning.
 *
 * @param pdM data manager used by the thread.
 */
RCCBaseThread::RCCBaseThread(DataManager* pdM):
    stop(false),
    _pdM(pdM),
    _sleepTime(RCCConsts::THREAD_SLEEP_TIME_MS)
{
}
/// Sleeps for `msecs` milliseconds by delegating to QThread::msleep().
void RCCBaseThread::msleep(unsigned long msecs)
{
    // The call is qualified so that it resolves to the QThread function and
    // not to this wrapper.
    QThread::msleep(msecs);
}
threads/routinethread.h
#ifndef ROUTINETHREAD_H
#define ROUTINETHREAD_H
#include "includes. h"
#include "result_unloaders/testru. h"
#include "rccbasethread. h"
// Intermediate base class for routine (housekeeping) threads; it adds nothing
// to RCCBaseThread besides a virtual destructor.
class RoutineThread : public RCCBaseThread
{
Q_OBJECT
public:
// Forwards pdM to RCCBaseThread.
RoutineThread(DataManager* pdM);
virtual ~RoutineThread();
};
#endif // ROUTINETHREAD_H
threads/routinethread.cpp
#include "routinethread. h"
/// Constructs a routine thread; the data manager pointer is forwarded to the
/// RCCBaseThread constructor.
RoutineThread::RoutineThread(DataManager* pdM)
    : RCCBaseThread(pdM)
{}
/// Nothing to release at this level of the hierarchy.
RoutineThread::~RoutineThread() = default;
threads/rt0.h
#ifndef RT0_H
#define RT0_H
#include "routinethread. h"
// Routine thread that periodically drains the DataManager's common and error
// logs into two files and, optionally, to the debug output.
class RT0 : public RoutineThread
{
Q_OBJECT
public:
// Reads the log file names and display flags from RCCSettings and opens
// both log files for writing.
RT0(DataManager* pdM);
// Closes both log files.
virtual ~RT0();
protected:
// Thread body: calls _log() and sleeps, until `stop` is set; one last
// _log() pass is made after `stop` is observed, with console display off.
virtual void run();
// Writes the pending common log items, then the pending error log items.
void _log();
// Drains DataManager::CommonLog into _cLTS (and qDebug() if _displayCL).
void _logCommon();
// Drains DataManager::ErrorLog into _eLTS (and qDebug() if _displayEL).
void _logErrors();
// Common log file ("CommonLogFile" setting).
QFile _cLFile;
// Error log file ("ErrorLogFile" setting).
QFile _eLFile;
// "DisplayCommonLog" setting: also print common log items via qDebug().
bool _displayCL;
// "DisplayErrorLog" setting: also print error log items via qDebug().
bool _displayEL;
// Text stream writing to _cLFile.
QTextStream _cLTS;
// Text stream writing to _eLFile.
QTextStream _eLTS;
};
#endif // RT0_H
threads/rt0.cpp
#include "rt0.h"
/**
 * Creates the routine thread.
 *
 * Reads the sleep time, the two log file names and the two display flags from
 * RCCSettings, opens both log files for writing and attaches the text streams
 * to them. A file that cannot be opened is reported with qWarning() instead
 * of being ignored silently; construction continues either way.
 *
 * @param pdM data manager, forwarded to the RoutineThread base class.
 */
RT0::RT0(DataManager* pdM):
    RoutineThread(pdM)
{
    RCCSettings *psett = RCCSettings::instance();
    _sleepTime = psett->value("RoutineThreadSleepTime", "r").toInt();

    const QString commonLogFileName = psett->value("CommonLogFile", "r").toString();
    _cLFile.setFileName(commonLogFileName);
    if (!_cLFile.open(QIODevice::WriteOnly))
    {
        qWarning() << "RT0: cannot open common log file" << commonLogFileName
                   << "-" << _cLFile.errorString();
    }
    _cLTS.setDevice(&_cLFile);

    const QString errorLogFileName = psett->value("ErrorLogFile", "r").toString();
    _eLFile.setFileName(errorLogFileName);
    if (!_eLFile.open(QIODevice::WriteOnly))
    {
        qWarning() << "RT0: cannot open error log file" << errorLogFileName
                   << "-" << _eLFile.errorString();
    }
    _eLTS.setDevice(&_eLFile);

    _displayCL = psett->value("DisplayCommonLog", "r").toBool();
    _displayEL = psett->value("DisplayErrorLog", "r").toBool();
}
/// Closes the common log file and then the error log file.
RT0::~RT0()
{
    _cLFile.close();
    _eLFile.close();
}
/**
 * Thread body.
 *
 * Writes the pending log items and sleeps for _sleepTime milliseconds, over
 * and over. When `stop` is seen before a pass, console display is switched
 * off for that pass; when `stop` is seen after a pass, the thread finishes.
 */
void RT0::run()
{
    for (;;)
    {
        if (stop)
            _displayCL = _displayEL = false;

        _log();

        // Checked after logging so that items queued before the stop request
        // are still written.
        if (stop)
            return;

        msleep(_sleepTime);
    }
}
/// Performs one logging pass: common log items first, error log items second.
void RT0::_log()
{
    _logCommon();
    _logErrors();
}
/**
 * Writes all pending common log items to the common log file.
 *
 * Each item becomes one line of the form
 *     "message";page-data
 * where the page-data part is empty for items whose page data has id 0.
 * When _displayCL is set, the item is also printed through qDebug().
 * The container returned by the data manager is deleted afterwards.
 */
void RT0::_logCommon()
{
    PDLIContainer *ppDLIs = _pdM->getLog(DataManager::CommonLog);
    if (!ppDLIs)
        return; // nothing was handed over

    while (!ppDLIs->isEmpty())
    {
        PDLogItem pDLI = ppDLIs->takeFirst();
        QString pDStr = pDLI.pD.id ? pDLI.pD.toString() : "";

        // The quote characters must be escaped: a bare """" is just two
        // adjacent empty string literals and writes no quote at all.
        _cLTS << "\"" + pDLI.message + "\";" + pDStr << endl;

        if (_displayCL)
        {
            qDebug().noquote() << pDLI.message + pDStr << endl;
        }
    }
    delete ppDLIs;
}
void RT0::_logErrors()
{
PDLIContainer *ppDLIs = _pdM->getLog(DataManager::ErrorLog);
while (!ppDLIs->isEmpty())
{
PDLogItem pDLI = ppDLIs->takeFirst();
QString pDStr = pDLI. pD. toString();
_eLTS << """" + pDLI. message + """;" + pDStr << endl;
if (_displayEL)
{
qDebug().noquote() << "Error: " + pDLI. message + ";" + pDStr << endl;
}
}
delete ppDLIs;
}
|
Из-за большого объема этот материал размещен на нескольких страницах:
1 2 3 4 5 6 7 8 9 |


