}
_mutexPA. unlock();
}
void BMICDM::_removePAsByToAndFrom(const QVector<ulong> &pDIdsToRemove)
{
_mutexPA. lock();
typedef PAStore::index<PA::ByTo>::type::iterator ByToIt;
PAStore::index<PA::ByTo>::type & to = _storePA. get<PA::ByTo>();
for (auto& id: pDIdsToRemove)
{
ByToIt it0, it1;
std::tie(it0,it1) = to. equal_range(id);
to. erase(it0, it1);
}
typedef PAStore::index<PA::ByFrom>::type::iterator ByFromIt;
PAStore::index<PA::ByFrom>::type & from = _storePA. get<PA::ByFrom>();
for (auto& id: pDIdsToRemove)
{
ByFromIt it0, it1;
std::tie(it0,it1) = from. equal_range(id);
from. erase(it0, it1);
}
_mutexPA. unlock();
}
/**
 * Looks a page up by id in the PD::ById index of _storePD.
 *
 * @param id id to search for.
 * @return a copy of the stored PageData, or a default-constructed PageData
 *         when no entry with that id exists.
 */
PageData BMICDM::_findById(ulong id)
{
    auto &byId = _storePD.get<PD::ById>();
    const auto found = byId.find(id);
    if (found == byId.end())
    {
        return PageData();
    }
    return *found;
}
void BMICDM::insertPAs(const QVector<PageArc> &pAs)
{
_mutexPA. lock();
for (const PageArc &pA: pAs)
{
_storePA. insert(pA);
}
_mutexPA. unlock();
}
data_managers/datamanager.h
#ifndef DATAMANAGER_H
#define DATAMANAGER_H
#include "includes. h"
#include "rccconsts. h"
#include "data_structures/pagedata. h"
#include "data_structures/pagearc. h"
#include "data_structures/pdlogitem. h"
#include "data_structures/pdandpacreatedata. h"
// Container aliases used in the DataManager interface.
using PAContainer = QVector<PageArc>;          // sequence of page arcs
using PDLIContainer = QQueue<PDLogItem>;       // queue of log items
using PDPAContainer = QVector<PDPACreateData>; // sequence of PDPACreateData records
// Abstract storage interface: apart from the constructor and destructor every
// member function is pure virtual and must be supplied by a concrete manager.
// "PD" / "PA" in the names correspond to the PageData / PageArc containers
// (PDContainer, PAContainer) used in the signatures.
class DataManager
{
public:
// Selects which log a log-related call operates on.
enum LogType
{
CommonLog,
ErrorLog
};
DataManager();
virtual ~DataManager();
// Host management. NOTE(review): ownership of the HostData pointers is not
// visible from this header - confirm against the implementations.
virtual bool addHost(HostData* phD) = 0;
virtual bool allWorkIsDone() = 0;
virtual QVector<HostData*> getHosts() = 0;
// Chunked readers: results are written into the `result` out-parameter.
virtual void getPDsStartingFromId(PDContainer &result, const ulong &id,
const int& countMultiplier = RCCConsts::PAGEDATA_CHUNK_SIZE_COUNT_MULTIPLIER) = 0;
virtual void getPAsStartingFromId(PAContainer &result, const ulong &id,
const int& count = RCCConsts::PAGEDATA_DEFAULT_CHUNK_SIZE) = 0;
// NOTE(review): the parameter is called countMultiplier but defaults to
// PAGEDATA_DEFAULT_CHUNK_SIZE, unlike getPDsStartingFromId - confirm intent.
virtual void getFreePDsForDownloading(PDContainer &result, HostData *phD,
const int& countMultiplier = RCCConsts::PAGEDATA_DEFAULT_CHUNK_SIZE) = 0;
virtual void getFreePDsForParsing(PDContainer &result, const int& count = RCCConsts::PAGEDATA_DEFAULT_CHUNK_SIZE) = 0;
// Page-data writers.
virtual void insertPDs(const PDContainer &pDs) = 0;
virtual void addPDsAndPAs(const PDPAContainer &pDPAs) = 0;
virtual void updatePDs(const PDContainer &pDs) = 0;// the method must delete an entry if its remove flag is set
// Returns a (bool, PageData) pair; presumably the flag tells whether the
// content was already seen for this host - verify against implementations.
virtual std::pair<bool, PageData> contentSeen(HostData* phD, const QString& content) = 0;
// Page-arc writers.
virtual void insertPAs(const PAContainer& pAs) = 0;
// NOTE(review): takes int although PageArc::to is declared as ulong.
virtual void changePAsByTo(const int &to, const int &newTo) = 0;
// Logging
// NOTE(review): getLog returns a raw pointer; who owns it is not visible here.
virtual PDLIContainer* getLog(const DataManager::LogType &logType, const int size = INT_MAX) = 0;
virtual void insertLogItem(const DataManager::LogType &logType, const PDLogItem& pDLI) = 0;
virtual void insertLogItems(const DataManager::LogType &logType, const QVector<PDLogItem> &items) = 0;
};
#endif // DATAMANAGER_H
data_managers/datamanager.cpp
#include "datamanager. h"
// The interface class has no state of its own, so there is nothing to set up.
DataManager::DataManager() = default;
// Out-of-line definition of the virtual destructor; nothing to release here.
DataManager::~DataManager() = default;
data_structures/hostdata.h
#ifndef HOSTDATA_H
#define HOSTDATA_H
#include <QString>
#include <QUrl>
#include "rccconsts. h"
#include "robotstxtrule. h"
// Per-host settings and the pieces of the host's URL.
struct HostData
{
// Default constructor; takes no URL or limits.
HostData();
// Parses `url` with QUrl and stores the given limits.
HostData(const QString& url, const int& maxDownloadsAtTime,
const int& crawlDelay, const int& maxCrawlLevel);
// Host part of the URL with any "/" removed.
QString host;
// Set from the constructor's crawlDelay argument; units are not visible here.
uint crawlDelay;
// URL scheme as reported by QUrl::scheme().
QString protocol;
// protocol + "://" + host, optionally followed by ":" and the port.
QString str;
uint maxDownloadsAtTime;
// NOTE(review): presumably the robots.txt rules for this host (type comes from robotstxtrule.h) - confirm.
RTRContainer rules;
int maxCrawlLevel;
// Port as reported by QUrl::port().
int port;
};
#endif // HOSTDATA_H
data_structures/hostdata.cpp
#include "hostdata. h"
/**
 * Creates an empty host description.
 *
 * The scalar members are given definite values so that a default-constructed
 * object never exposes indeterminate data; the QString members and `rules`
 * are default-constructed. `port` uses -1, the value QUrl::port() reports
 * when a URL carries no port.
 */
HostData::HostData():
    crawlDelay(0),
    maxDownloadsAtTime(0),
    maxCrawlLevel(0),
    port(-1)
{
}
/**
 * Builds a host description from a URL and crawl limits.
 *
 * @param url                 URL parsed with QUrl to obtain scheme, host and port.
 * @param maxDownloadsAtTime  stored in maxDownloadsAtTime.
 * @param downloadDelay       stored in crawlDelay.
 * @param maxCrawlLevel       stored in maxCrawlLevel.
 *
 * `str` becomes "<scheme>://<host>", with ":<port>" appended only when the
 * URL names an explicit port other than 80.
 */
HostData::HostData(const QString& url, const int& maxDownloadsAtTime,
                   const int& downloadDelay, const int& maxCrawlLevel):
    crawlDelay(downloadDelay),
    maxDownloadsAtTime(maxDownloadsAtTime),
    maxCrawlLevel(maxCrawlLevel)
{
    const QUrl u(url);
    protocol = u.scheme();
    host = u.host().replace("/", "");
    port = u.port();
    str = protocol + "://" + host;
    // QUrl::port() returns -1 when no port is specified; testing only for
    // non-zero would append ":-1" to every URL without an explicit port.
    if (port > 0 && port != 80)
        str += ":" + QString::number(port);
}
data_structures/hostdownloaddata.h
#ifndef HOSTDOWNLOADDATA_H
#define HOSTDOWNLOADDATA_H
#include "pagedata. h"
// Download bookkeeping record: a counter, a PDContainer and an iterator into a PDContainer.
struct HostDownloadData
{
// Sets curDownloadsCount to 0.
HostDownloadData();
// NOTE(review): presumably the number of downloads currently in flight - confirm with callers.
int curDownloadsCount;
PDContainer pDs;
// NOTE(review): presumably a cursor into pDs; it is invalidated whenever pDs reallocates - confirm usage.
PDContainer::iterator pDsIt;
};
#endif // HOSTDOWNLOADDATA_H
data_structures/hostdownloaddata.cpp
#include "hostdownloaddata. h"
HostDownloadData::HostDownloadData():
curDownloadsCount(0)
{
}
data_structures/pagearc.h
#ifndef PAGEARC_H
#define PAGEARC_H
#include "includes. h"
// Directed link between two pages, identified by the ids of its endpoints.
struct PageArc
{
PageArc();
// Stores the endpoints and sets id to 0.
PageArc(const ulong &from, const ulong &to);
PageArc(const ulong &id, const ulong &from, const ulong &to);
// Id of the arc itself.
ulong id;
// Id of the source endpoint.
ulong from;
// Id of the target endpoint.
ulong to;
// Increments lastId and returns it without any locking.
static ulong newIdUnsafe();
// Same as newIdUnsafe() but performed under a function-local mutex.
static ulong newIdSafe();
// Most recently issued id.
static ulong lastId;
// Returns "id;from;to".
QString toString() const;
};
#endif // PAGEARC_H
data_structures/pagearc.cpp
#include "pagearc. h"
// Counter behind newIdUnsafe()/newIdSafe(); both pre-increment it, so the first issued id is 1.
ulong PageArc::lastId = 0;
/**
 * Creates an arc with all three ids set to 0.
 *
 * `from` and `to` are initialised as well as `id`, so that toString() and
 * copies of a default-constructed arc never read indeterminate values.
 */
PageArc::PageArc():
    id(0),
    from(0),
    to(0)
{}
// Arc between two pages with no id assigned yet (id is 0).
PageArc::PageArc(const ulong &from, const ulong &to)
    : PageArc(0, from, to)
{}
PageArc::PageArc(const ulong &id, const ulong &from, const ulong &to):
id(id),
from(from),
to(to)
{
}
// Advances the shared counter and returns the new value; performs no locking.
ulong PageArc::newIdUnsafe()
{
    return ++lastId;
}
// Advances the shared counter under a function-local mutex and returns the
// new value. The mutex only serialises callers of this function.
ulong PageArc::newIdSafe()
{
    static QMutex counterGuard;

    counterGuard.lock();
    const ulong issued = ++lastId;
    counterGuard.unlock();

    return issued;
}
// Serialises the arc as "id;from;to" using decimal numbers.
QString PageArc::toString() const
{
    QString text = QString::number(id);
    text += ";";
    text += QString::number(from);
    text += ";";
    text += QString::number(to);
    return text;
}
data_structures/pagedata.h
#ifndef PAGEDATA_H
#define PAGEDATA_H
#include "includes. h"
#include "hostdata. h"
// Record describing one page: its URL, owning host, and processing state flags.
struct PageData
{
PageData();// does not assign an id
PageData(const ulong &id, const QString& url, HostData *phD);
PageData(const ulong &id, const QString& url, const QString& normalizedUrl, HostData *phD);
// Returns id;host string;"normalizedUrl";idFrom.
QString toString() const;
// Increments lastId and returns it without any locking.
static ulong newIdUnsafe();
// Like newIdUnsafe() but the increment happens under a mutex.
static ulong newIdSafe();
static uint hashContent(const QString& content);
// Most recently issued id.
static ulong lastId;
static QString normalizeUrl(const QString &urlStr);
ulong id;
// Non-owning as far as this header shows; nullptr for default-constructed objects.
HostData* phD;
QString url;
QString normalizedUrl;
// NOTE(review): presumably the id of the page this one was discovered from - confirm.
ulong idFrom;
uint level;
uint outDegree;
// State flags; all start as false in every constructor.
bool blocked;
bool downloaded;
bool parsed;
QString content;
// NOTE(review): presumably the value of hashContent(content) - confirm.
uint contentHash;
uint errorCode;
ulong replaceId;
uint downloadAttempts;
bool remove;
};
// Sequence of PageData records.
using PDContainer = QVector<PageData>;
// Hash function for QString keys, declared here and defined elsewhere.
// NOTE(review): the name matches the boost::hash extension point - confirm.
std::size_t hash_value(const QString &x);
#endif // PAGEDATA_H
data_structures/pagedata.cpp
#include "pagedata. h"
// Counter behind the newId* helpers; they pre-increment it, so the first issued id is 1.
ulong PageData::lastId = 0;
// Empty record: id 0, empty (non-null) URL strings, no host, all counters
// and flags cleared. Reuses the four-argument constructor for the defaults.
PageData::PageData()
    : PageData(0, QString(""), QString(""), nullptr)
{
}
// Record with an id, raw URL and host; the normalized URL starts as an empty
// string and every counter/flag is cleared. Reuses the four-argument constructor.
PageData::PageData(const ulong &id, const QString &url, HostData *phD)
    : PageData(id, url, QString(""), phD)
{
}
// Record with an id, raw and normalized URL, and host. All counters, flags
// and the content fields start cleared. Initialisers are listed in member
// declaration order, which is the order in which they run.
PageData::PageData(const ulong &id, const QString &url, const QString &normalizedUrl, HostData *phD)
    : id(id)
    , phD(phD)
    , url(url)
    , normalizedUrl(normalizedUrl)
    , idFrom(0)
    , level(0)
    , outDegree(0)
    , blocked(false)
    , downloaded(false)
    , parsed(false)
    , content("")
    , contentHash(0)
    , errorCode(0)
    , replaceId(0)
    , downloadAttempts(0)
    , remove(false)
{
}
/**
 * Serialises the record as: id;host string;"normalizedUrl";idFrom
 *
 * @return the formatted line. The host field is empty when no host is
 *         attached, instead of dereferencing a null pointer.
 */
QString PageData::toString() const
{
    // phD is nullptr for default-constructed records, so guard the access.
    const QString hostStr = phD ? phD->str : QString();
    return QString::number(id) + ";" + hostStr + ";\"" + normalizedUrl
            + "\";" + QString::number(idFrom);
}
// Advances the shared counter and returns the new value; performs no locking.
ulong PageData::newIdUnsafe()
{
    return ++lastId;
}
ulong PageData::newIdSafe()
{
static QMutex mut;
mut. lock();
ulong res = ++lastId;
mut. unlock();
|
Из-за большого объема этот материал размещен на нескольких страницах:
1 2 3 4 5 6 7 8 9 |


