return res;
}
/// Computes a hash of page content by delegating to qHash().
/// @param content text to hash
/// @return the qHash() value for @p content
uint PageData::hashContent(const QString &content)
{
    const uint digest = qHash(content);
    return digest;
}
/**
 * Builds a canonical, host-relative form of a URL.
 *
 * Steps performed:
 *  - HTML entity leftovers are undone ("&amp;" -> "&", "&quot;" -> "%22");
 *  - the fragment is removed and path segments are normalized by QUrl;
 *  - query parameters are ordered by name, parameters with an empty name are dropped;
 *  - only path and query are kept, and the result always starts with '/'.
 *
 * @param urlStr URL text as found on a page
 * @return "/path" or "/path?name=value&..." built from @p urlStr
 */
QString PageData::normalizeUrl(const QString &urlStr)
{
    QString cleaned(urlStr);

    // Work around sloppy markup on the crawled site: un-escape HTML entities.
    // The loop also unwinds multiply-escaped input such as "&amp;amp;"; every
    // pass shortens the string, so it terminates.
    while (cleaned.contains("&amp;"))
    {
        cleaned.replace("&amp;", "&");
    }
    // NOTE(review): the search literal was reconstructed as "&quot;" (the entity
    // had been decoded in the listing, which left an unparsable literal) - confirm.
    cleaned.replace("&quot;", "%22");

    QUrl url(cleaned);
    url = url.adjusted(QUrl::RemoveFragment | QUrl::NormalizePathSegments);

    const QUrlQuery query(url);
    QList<QPair<QString, QString> > items = query.queryItems();

    // stable_sort keeps repeated parameter names in their original relative
    // order, so the same URL always yields the same normalized string.
    std::stable_sort(items.begin(), items.end(),
                     [](const QPair<QString, QString> &a, const QPair<QString, QString> &b)
                     { return a.first < b.first; });

    QString queryString;
    for (const auto &item : items)
    {
        // Parameters without a name are skipped (empty names sort first, so no
        // separator has been written yet when they are encountered).
        if (item.first.isEmpty())
            continue;
        if (!queryString.isEmpty())
            queryString += "&";
        queryString += item.first + "=" + item.second;
    }

    QString res = url.path();
    if (!queryString.isEmpty())
        res += '?' + queryString;

    // Guarantee a leading slash, also for an empty path.
    if (!res.startsWith(QLatin1Char('/')))
        res.prepend(QLatin1Char('/'));
    return res;
}
/// Free-function hash for QString that forwards to qHash().
/// NOTE(review): presumably the customization point picked up by boost::hash
/// for the hashed index keys - confirm.
std::size_t hash_value(const QString &x)
{
    return static_cast<std::size_t>(qHash(x));
}
data_structures/pastore.h
#ifndef PASTORE
#define PASTORE
#include "pagearc. h"
using namespace boost::multi_index;
namespace PA
{
struct ById{};
struct ByFromAndTo{};
struct ByTo{};
struct ByFrom{};
struct FromChange : public std::unary_function<PageArc, void>
{
ulong from;
FromChange(const ulong& _from) : from(_from) {}
void operator()(PageArc& pA)
{
pA. from = from;
}
};
struct ToChange : public std::unary_function<PageArc, void>
{
ulong to;
ToChange(const ulong& _to) : to(_to) {}
void operator()(PageArc& pA)
{
pA. to = to;
}
};
}
// Container of PageArc records with four indices:
//   PA::ById        - ordered, unique, keyed by PageArc::id
//   PA::ByTo        - ordered, non-unique, keyed by PageArc::to
//   PA::ByFrom      - ordered, non-unique, keyed by PageArc::from
//   PA::ByFromAndTo - hashed, unique, keyed by the (from, to) pair
using PAStore = boost::multi_index_container<
    PageArc,
    indexed_by<
        ordered_unique<
            tag<PA::ById>,
            member<PageArc, ulong, &PageArc::id>
        >,
        ordered_non_unique<
            tag<PA::ByTo>,
            member<PageArc, ulong, &PageArc::to>
        >,
        ordered_non_unique<
            tag<PA::ByFrom>,
            member<PageArc, ulong, &PageArc::from>
        >,
        hashed_unique<
            tag<PA::ByFromAndTo>,
            composite_key<
                PageArc,
                member<PageArc, ulong, &PageArc::from>,
                member<PageArc, ulong, &PageArc::to>
            >
        >
    >
>;
#endif // PASTORE
data_structures/pdlogitem.h
#ifndef PDLOGITEM_H
#define PDLOGITEM_H
#include "pagedata. h"
// Pairs a text message with a copy of a PageData record.
struct PDLogItem
{
    QString message; // message text
    PageData pD;     // PageData stored alongside the message

    // Creates an item with an empty message and a default PageData.
    PDLogItem();
    // Creates an item holding copies of the given message and page data.
    PDLogItem(const QString &message, const PageData &pD);
};
#endif // PDLOGITEM_H
data_structures/pdlogitem.cpp
#include "pdlogitem. h"
// Default constructor: empty (non-null) message, value-initialized PageData.
PDLogItem::PDLogItem()
    : message("")
    , pD()
{
}
// Stores copies of the supplied message and page data.
// The parameters intentionally share the members' names; inside the
// initializer list "x(x)" initializes member x from parameter x.
PDLogItem::PDLogItem(const QString &message, const PageData &pD)
    : message(message)
    , pD(pD)
{
}
data_structures/pdpacreatedata.h
#ifndef PDANDPACREATEDATA_H
#define PDANDPACREATEDATA_H
#include "pagedata. h"
// Bundles a URL, its normalized form and a PageData record.
// NOTE(review): judging by the name this is the input for creating a page and
// an arc together - confirm against the code that consumes it.
struct PDPACreateData
{
    QString url;           // URL text
    QString normalizedUrl; // normalized form of the URL
    PageData pD;           // associated PageData

    // Creates an object with empty strings and a default PageData.
    PDPACreateData();
    // Creates an object holding copies of the three arguments.
    PDPACreateData(const QString &url, const QString &normalizedUrl, const PageData &pD);
};
#endif // PDANDPACREATEDATA_H
data_structures/pdpacreatedata.cpp
#include "pdpacreatedata. h"
// Default constructor: empty (non-null) strings, value-initialized PageData.
PDPACreateData::PDPACreateData()
    : url("")
    , normalizedUrl("")
    , pD()
{
}
// Stores copies of the supplied URL, normalized URL and page data.
// Parameters share the members' names; "x(x)" initializes member x from
// parameter x.
PDPACreateData::PDPACreateData(const QString &url,
                               const QString &normalizedUrl,
                               const PageData &pD)
    : url(url)
    , normalizedUrl(normalizedUrl)
    , pD(pD)
{
}
data_structures/pdstore.h
#ifndef PDSTORE_H
#define PDSTORE_H
#include "includes. h"
#include "pagedata. h"
using namespace boost::multi_index;
namespace PD
{
struct ById{};
struct ByBlocked{};
struct ByPhDAndBlockedAndDownloaded{};
struct ByBlockedAndDownloadedAndParsedAndErrorCode{};
struct ByPhDAndContentHash{};
struct ByUrlAndPhD{};
struct ByPhDAndNormalizedUrl{};
struct ByIdFrom{};
struct BlockedChange : public std::unary_function<PageData, void>
{
bool b;
BlockedChange(const bool& _b) : b(_b) {}
void operator()(PageData& pd)
{
pd. blocked = b;
}
};
struct LevelChange : public std::unary_function<PageData, void>
{
uint l;
LevelChange(const uint& _l) : l(_l) {}
void operator()(PageData& pd)
{
pd. level = l;
}
};
}
// Container of PageData records. Indices, in declaration order:
//   PD::ById                   - ordered, non-unique, by id
//   PD::ByBlocked              - ordered, non-unique, by blocked
//   PD::ByPhDAndBlockedAndDownloaded
//                              - ordered, non-unique, by (phD, blocked, downloaded)
//   PD::ByBlockedAndDownloadedAndParsedAndErrorCode
//                              - ordered, non-unique, by (blocked, downloaded, parsed, errorCode)
//   PD::ByPhDAndContentHash    - ordered, non-unique, by (phD, contentHash)
//   PD::ByPhDAndNormalizedUrl  - ordered, unique, by (phD, normalizedUrl)
//   PD::ByIdFrom               - ordered, non-unique, by idFrom
using PDStore = boost::multi_index_container<
    PageData,
    indexed_by<
        ordered_non_unique<
            tag<PD::ById>,
            member<PageData, ulong, &PageData::id>
        >,
        ordered_non_unique<
            tag<PD::ByBlocked>,
            member<PageData, bool, &PageData::blocked>
        >,
        ordered_non_unique<
            tag<PD::ByPhDAndBlockedAndDownloaded>,
            composite_key<
                PageData,
                member<PageData, HostData*, &PageData::phD>,
                member<PageData, bool, &PageData::blocked>,
                member<PageData, bool, &PageData::downloaded>
            >
        >,
        ordered_non_unique<
            tag<PD::ByBlockedAndDownloadedAndParsedAndErrorCode>,
            composite_key<
                PageData,
                member<PageData, bool, &PageData::blocked>,
                member<PageData, bool, &PageData::downloaded>,
                member<PageData, bool, &PageData::parsed>,
                member<PageData, uint, &PageData::errorCode>
            >
        >,
        ordered_non_unique<
            tag<PD::ByPhDAndContentHash>,
            composite_key<
                PageData,
                member<PageData, HostData*, &PageData::phD>,
                member<PageData, uint, &PageData::contentHash>
            >
        >,
        ordered_unique<
            tag<PD::ByPhDAndNormalizedUrl>,
            composite_key<
                PageData,
                member<PageData, HostData*, &PageData::phD>,
                member<PageData, QString, &PageData::normalizedUrl>
            >
        >,
        ordered_non_unique<
            tag<PD::ByIdFrom>,
            member<PageData, ulong, &PageData::idFrom>
        >
    >
>;
// View of PDStore through its PD::ByBlocked index.
using BlockedList = PDStore::index<PD::ByBlocked>::type;
#endif // PDSTORE_H
data_structures/robotstxtrule.h
#ifndef ROBOTSTXTRULES_H
#define ROBOTSTXTRULES_H
#include "includes. h"
// One rule made of a type (Allow / Disallow) and a template string.
// NOTE(review): presumably one Allow/Disallow line of a robots.txt file, with
// templateStr holding the path pattern - confirm against the parser.
struct RobotsTxtRule
{
    enum RuleType
    {
        Allow,
        Disallow
    };

    RobotsTxtRule();
    RobotsTxtRule(const RuleType &type, const QString &templ);

    RuleType type;       // kind of rule
    QString templateStr; // template text supplied at construction
};
// Sequence of rules.
using RTRContainer = QVector<RobotsTxtRule>;
#endif // ROBOTSTXTRULES_H
data_structures/robotstxtrule.cpp
#include "robotstxtrule. h"
// Default constructor. Initializes every member so that a default-constructed
// rule never carries an indeterminate `type`.
// NOTE(review): Allow (the first enumerator) is used as the default - confirm
// that this is the intended meaning of an "empty" rule.
RobotsTxtRule::RobotsTxtRule() :
    type(RobotsTxtRule::Allow),
    templateStr()
{
}
// Builds a rule from the given type and template string.
RobotsTxtRule::RobotsTxtRule(const RobotsTxtRule::RuleType &type, const QString &templ)
    : type(type)
    , templateStr(templ)
{
}
result_unloaders/gephisitegraphru.h
#ifndef GEPHISITEGRAPHRU_H
#define GEPHISITEGRAPHRU_H
#include "resultunloader. h"
// ResultUnloader that writes per-host CSV files: one "nodes_..." and one
// "edges_..." file per host (see _manageHosts() and the header constants below).
// NOTE(review): presumably the files are meant for import into Gephi - confirm.
class GephiSiteGraphRU : public ResultUnloader
{
public:
// pdM is forwarded to the ResultUnloader base constructor.
GephiSiteGraphRU(DataManager *pdM);
~GephiSiteGraphRU();
// Runs the export steps below in order.
virtual void unloadResult();
protected:
// Export steps, called by unloadResult() in this order.
void _manageHosts();
void _unloadNodes();
void _unloadEdges();
void _endUnload();
// NOTE(review): presumably maps internal page ids to the ids written to the
// exported files - confirm in _unloadNodes().
QHash<ulong, ulong> _innerIdGephiId;
// Hosts obtained from the data manager in _manageHosts().
QVector<HostData*> _hosts;
// Per-host nodes file and the text stream wrapping it; both are allocated
// with new in _manageHosts().
QMap<HostData*, QFile*> _nodesFiles;
QMap<HostData*, QTextStream*> _nTSs;
// NOTE(review): presumably the per-host edges file and its text stream,
// analogous to the two maps above - confirm in _manageHosts().
QMap<HostData*, QFile*> _edgesFiles;
QMap<HostData*, QTextStream*> _eTSs;
// NOTE(review): presumably a per-host counter of exported pages - confirm.
QMap<HostData*, ulong> _pagesUnloaded;
// NOTE(review): presumably maps a page id to its host - confirm.
QMap<ulong, HostData*> _idHost;
// Header lines for the nodes and edges CSV files (semicolon-separated).
const QString NODES_CSV_FILE_HEADER = "id;label;level;out_degree";
const QString EDGES_CSV_FILE_HEADER = "source;target";
};
#endif // GEPHISITEGRAPHRU_H
result_unloaders/gephisitegraphru.cpp
#include "gephisitegraphru. h"
// Forwards the data manager pointer to the ResultUnloader base; no other setup.
GephiSiteGraphRU::GephiSiteGraphRU(DataManager *pdM)
    : ResultUnloader(pdM)
{
}
// Nothing is released here.
// NOTE(review): _manageHosts() allocates QFile/QTextStream objects with new;
// verify that _endUnload() deletes them, otherwise they leak.
GephiSiteGraphRU::~GephiSiteGraphRU() = default;
// Runs the complete export as four steps in a fixed order:
// _manageHosts(), _unloadNodes(), _unloadEdges(), _endUnload().
void GephiSiteGraphRU::unloadResult()
{
// Opens the per-host output files and streams.
_manageHosts();
_unloadNodes();
_unloadEdges();
_endUnload();
}
void GephiSiteGraphRU::_manageHosts()
{
RCCSettings *psett = RCCSettings::instance();
QString folder = psett->value( "ResultFolder", "r").toString();
_hosts = _pdM->getHosts();
for (HostData* phost: _hosts)
{
QString fNPart = phost->protocol + "_" + phost->host + ".csv";
QFile* pnodes = new QFile(folder + "/nodes_" + fNPart);
pnodes->open(QIODevice::WriteOnly);
QTextStream* pnTS = new QTextStream(pnodes);
_nodesFiles. insert(phost, pnodes);
_nTSs. insert(phost, pnTS);
QFile* pedges = new QFile(folder + "/edges_" + fNPart);
pedges->open(QIODevice::WriteOnly);
QTextStream* peTS = new QTextStream(pedges);
|
Из-за большого объема этот материал размещен на нескольких страницах:
1 2 3 4 5 6 7 8 9 |


