_edgesFiles. insert(phost, pedges);
_eTSs. insert(phost, peTS);
_pagesUnloaded. insert(phost, 0);
}
}
void GephiSiteGraphRU::_unloadNodes()
{
for (QTextStream* ts: _nTSs)
{
*ts << NODES_CSV_FILE_HEADER << endl;
}
PDContainer pDs;
ulong id = 0;
_pdM->getPDsStartingFromId(pDs, id);
while(pDs. size())
{
for(const PageData& pD: pDs)
{
if (!pD. parsed)
continue;
ulong gephiId = _pagesUnloaded. value(pD. phD);
_innerIdGephiId. insert(pD. id, gephiId);
QTextStream *ts = _nTSs. value(pD. phD);
*ts << gephiId << ";" << "\"" + pD. normalizedUrl + "\"" << ";"
<< pD. level << ";" << pD. outDegree << endl;
gephiId++;
_pagesUnloaded. insert(pD. phD, gephiId);
_idHost. insert(pD. id, pD. phD);
}
id = pDs. last().id + 1;
_pdM->getPDsStartingFromId(pDs, id);
}
}
/**
 * Writes the edge CSV files: one header line per host stream, then one
 * "from;to" row (Gephi ids) per arc, into the stream of the host that owns
 * the arc's target page.
 *
 * An arc is exported only when both of its endpoints were exported as nodes,
 * i.e. both page ids are present in _innerIdGephiId. Presence is tested with
 * contains() rather than with a 0 default, because 0 is itself a valid Gephi
 * id (numbering starts at 0): with a 0 sentinel an arc with an unknown source
 * was written as an edge from node 0, and a real 0 -> 0 edge was dropped.
 */
void GephiSiteGraphRU::_unloadEdges()
{
    for (QTextStream* ts: _eTSs)
    {
        *ts << EDGES_CSV_FILE_HEADER << endl;
    }

    PAContainer pAs;
    ulong id = 0;
    _pdM->getPAsStartingFromId(pAs, id);
    while (pAs.size())
    {
        for (const PageArc& pA: pAs)
        {
            // Skip arcs that touch a page which was never written as a node.
            if (!_innerIdGephiId.contains(pA.from) || !_innerIdGephiId.contains(pA.to))
                continue;

            HostData* phD = _idHost.value(pA.to, nullptr);
            if (phD == nullptr)
                continue;

            // No edge stream registered for this host: nothing to write to.
            QTextStream* ts = _eTSs.value(phD, nullptr);
            if (ts == nullptr)
                continue;

            // NOTE(review): Gephi ids are numbered per host, while the row is
            // written to the target's host file - confirm cross-host arcs
            // are meant to be exported this way.
            ulong from = _innerIdGephiId.value(pA.from);
            ulong to = _innerIdGephiId.value(pA.to);
            *ts << from << ";" << to << endl;
        }

        // Continue right after the last arc of this batch.
        id = pAs.last().id + 1;
        _pdM->getPAsStartingFromId(pAs, id);
    }
}
void GephiSiteGraphRU::_endUnload()
{
for (auto v: _nodesFiles)
{
v->close();
delete v;
}
for (auto v: _edgesFiles)
{
v->close();
delete v;
}
for (auto v: _nTSs)
{
delete v;
}
for (auto v: _eTSs)
{
delete v;
}
_nodesFiles. clear();
_edgesFiles. clear();
_nTSs. clear();
_eTSs. clear();
_hosts. clear();
_innerIdGephiId. clear();
_pagesUnloaded. clear();
}
result_unloaders/resultunloader.h
#ifndef RESULTUNLOADER_H
#define RESULTUNLOADER_H
#include "includes. h"
#include "data_managers/datamanager. h"
/**
 * Abstract base of the result unloaders.
 *
 * Holds the DataManager pointer passed to the constructor; concrete
 * unloaders implement unloadResult().
 */
class ResultUnloader
{
public:
    /// Stores @p pdM for use by derived classes.
    ResultUnloader(DataManager* pdM);

    virtual ~ResultUnloader();

    /// Performs the export; must be provided by every concrete unloader.
    virtual void unloadResult() = 0;

protected:
    /// Data source handed in at construction (stored as a raw pointer).
    DataManager* _pdM;
};
#endif // RESULTUNLOADER_H
result_unloaders/resultunloader.cpp
#include "resultunloader. h"
/// Remembers the data manager the unloader will read from.
ResultUnloader::ResultUnloader(DataManager* pdM)
    : _pdM(pdM)
{}
/// Nothing to release here: the destructor body is empty.
ResultUnloader::~ResultUnloader() = default;
result_unloaders/testru.h
#ifndef TESTRU_H
#define TESTRU_H
#include "resultunloader. h"
class TestRU : public ResultUnloader
{
public:
TestRU(DataManager *pdM);
virtual void unloadResult();
protected:
void _unloadNodes();
void _unloadEdges();
};
#endif // TESTRU_H
result_unloaders/testru.cpp
#include "testru. h"
/// Passes the data manager straight to the base class.
TestRU::TestRU(DataManager* pdM)
    : ResultUnloader(pdM)
{}
/// Full export: nodes first, then edges.
void TestRU::unloadResult()
{
    _unloadNodes();   // page records
    _unloadEdges();   // arc records
}
/**
 * Dumps every page record to a semicolon-separated file in the folder given
 * by the "ResultFolder" setting (default "r").
 *
 * The first line is the column header; each following line is one PageData.
 * If the output file cannot be opened for writing the function returns
 * without reading any data (previously the open() result was ignored and
 * every row was streamed into a closed device).
 */
void TestRU::_unloadNodes()
{
    RCCSettings *psett = RCCSettings::instance();
    QString folder = psett->value( "ResultFolder", "r").toString();

    // NOTE(review): the file name contains a space after the dot - confirm
    // this is intended and not an artifact.
    QFile file(folder + "/result_pagedatas. csv");
    if (!file.open(QIODevice::WriteOnly))
        return;   // nothing can be written; skip the export

    QTextStream ts(&file);
    ts << "id;site;normalizedUrl;url;idFrom;blocked;content;parsed;downloaded;errorCode;remove;outDegree;level;replaceId;downloadAttempts"
       << endl;

    PDContainer pDs;
    ulong id = 0;
    _pdM->getPDsStartingFromId(pDs, id);
    while (pDs.size())
    {
        for (const PageData& pD: pDs)
        {
            // Column order matches the header line above.
            ts << pD.id << ";" << pD.phD->str + ";\"" << pD.normalizedUrl << "\";\"" << pD.url
               << "\";" << pD.idFrom << ";" << pD.blocked << ";" << pD.content << ";"
               << pD.parsed << ";" << pD.downloaded << ";" << pD.errorCode << ";" << pD.remove << ";"
               << pD.outDegree << ";" << pD.level << ";" << pD.replaceId << ";" << pD.downloadAttempts << endl;
        }

        // Continue right after the last page of this batch.
        id = pDs.last().id + 1;
        _pdM->getPDsStartingFromId(pDs, id);
    }
    file.close();
}
/**
 * Dumps every page arc to a semicolon-separated file in the folder given by
 * the "ResultFolder" setting (default "r").
 *
 * The first line is the header "id;from;to"; each following line is the
 * text returned by PageArc::toString(). If the output file cannot be opened
 * for writing the function returns without reading any data (previously the
 * open() result was ignored).
 */
void TestRU::_unloadEdges()
{
    RCCSettings *psett = RCCSettings::instance();
    QString folder = psett->value( "ResultFolder", "r").toString();

    // NOTE(review): the file name contains a space after the dot - confirm
    // this is intended and not an artifact.
    QFile file(folder + "/result_pagearcs. csv");
    if (!file.open(QIODevice::WriteOnly))
        return;   // nothing can be written; skip the export

    QTextStream ts(&file);
    ts << "id;from;to" << endl;

    PAContainer pAs;
    ulong id = 0;
    _pdM->getPAsStartingFromId(pAs, id);
    while (pAs.size())
    {
        for (const PageArc& pA: pAs)
        {
            ts << pA.toString() << endl;
        }

        // Continue right after the last arc of this batch.
        id = pAs.last().id + 1;
        _pdM->getPAsStartingFromId(pAs, id);
    }
    file.close();
}
robots_txt/dummyrobotstxt.h
#ifndef DUMMYROBOTSTXT_H
#define DUMMYROBOTSTXT_H
#include "robotstxt. h"
class DummyRobotsTxt : public RobotsTxt
{
public:
DummyRobotsTxt();
virtual bool allowUrl(const QString &url,
const RTRContainer &rules);
virtual RTRContainer readRules(const QString &siteUrl);
};
#endif // DUMMYROBOTSTXT_H
robots_txt/dummyrobotstxt.cpp
#include "dummyrobotstxt. h"
/// No state to initialise.
DummyRobotsTxt::DummyRobotsTxt() = default;
/// Always answers "allowed": both arguments are ignored.
bool DummyRobotsTxt::allowUrl(const QString &url, const RTRContainer &rules)
{
    const bool allowed = true;   // the dummy never blocks anything
    return allowed;
}
/// Returns a default-constructed rule container; @p siteUrl is ignored.
RTRContainer DummyRobotsTxt::readRules(const QString &siteUrl)
{
    RTRContainer noRules;
    return noRules;
}
robots_txt/onlydisallowrobotstxt.h
#ifndef ONLYDISALLOWROBOTSTXT_H
#define ONLYDISALLOWROBOTSTXT_H
#include "robotstxt. h"
#include "rccconsts. h"
class OnlyDisallowRobotsTxt : public RobotsTxt
{
public:
OnlyDisallowRobotsTxt();
virtual ~OnlyDisallowRobotsTxt();
virtual bool allowUrl(const QString &url,
const RTRContainer &rules);
virtual RTRContainer readRules(const QString &siteUrl);
};
#endif // ONLYDISALLOWROBOTSTXT_H
robots_txt/onlydisallowrobotstxt.cpp
#include "onlydisallowrobotstxt. h"
/// No state to initialise.
OnlyDisallowRobotsTxt::OnlyDisallowRobotsTxt() = default;
/// Nothing to release here: the destructor body is empty.
OnlyDisallowRobotsTxt::~OnlyDisallowRobotsTxt() = default;
/**
 * Checks @p url against the Disallow rules in @p rules.
 *
 * @return false as soon as one Disallow rule's template is a prefix of
 *         @p url; true when no such rule exists.
 */
bool OnlyDisallowRobotsTxt::allowUrl(const QString &url, const RTRContainer &rules)
{
    for (const RobotsTxtRule &candidate: rules)
    {
        // Only Disallow entries can block a URL.
        if (candidate.type != RobotsTxtRule::Disallow)
            continue;

        // Plain prefix match against the rule template.
        if (url.startsWith(candidate.templateStr))
            return false;
    }
    return true;
}
/**
 * Downloads "<siteUrl>/robots.txt" (blocking in a local event loop until the
 * request finishes) and extracts the Disallow rules of the first
 * "User-agent: *" group.
 *
 * Parsing rules:
 *  - everything from '#' to the end of a line is a comment and is removed
 *    before the line is interpreted;
 *  - a line is split into key and value at its FIRST ':' so values that
 *    contain a colon are kept intact;
 *  - rules are collected from the "User-agent: *" line until the next
 *    "User-agent" line;
 *  - a Disallow with an empty value is ignored (an empty prefix would match,
 *    and therefore block, every URL);
 *  - a single trailing '*' is dropped, turning the value into a plain
 *    prefix; values with a '*' anywhere else are not supported and skipped.
 *
 * @param siteUrl site root without a trailing slash.
 * @return the collected rules; empty when the download fails.
 */
RTRContainer OnlyDisallowRobotsTxt::readRules(const QString &siteUrl)
{
    RTRContainer rules;

    QNetworkAccessManager nam;
    QUrl url(siteUrl + "/robots.txt");
    QNetworkRequest request(url);
    request.setRawHeader("User-Agent", RCCConsts::USER_AGENT);
    QNetworkReply* preply = nam.get(request);

    // Wait synchronously for the reply.
    QEventLoop loop;
    QObject::connect(&nam, SIGNAL(finished(QNetworkReply*)), &loop, SLOT(quit()));
    loop.exec();

    if (preply->error() != QNetworkReply::NoError)
    {
        preply->deleteLater();
        return rules;
    }
    QString content(preply->readAll());
    preply->deleteLater();

    QStringList lines = content.split(QRegExp("[\r\n]"), QString::SkipEmptyParts);
    bool inRules = false;
    for (const QString &rawLine: lines)
    {
        // Strip the comment first so it can never leak into key or value.
        QString line = rawLine;
        int hashPos = line.indexOf("#");
        if (hashPos != -1)
        {
            line = line.left(hashPos);
        }

        // "key: value" - split at the first colon only.
        int colonPos = line.indexOf(":");
        if (colonPos == -1)
            continue;
        QString key = line.left(colonPos).trimmed().toLower();
        QString value = line.mid(colonPos + 1).trimmed();

        if (!inRules)
        {
            // Still looking for the wildcard user-agent group.
            if (key == "user-agent" && value == "*")
                inRules = true;
            continue;
        }

        // The next user-agent line ends the group we are interested in.
        if (key == "user-agent")
            break;

        if (key != "disallow" || value.isEmpty())
            continue;

        int asteriskIndex = value.indexOf("*");
        if (asteriskIndex == -1)
        {
            rules.append(RobotsTxtRule(RobotsTxtRule::Disallow, value));
        }
        else if (asteriskIndex == value.length() - 1)
        {
            // "prefix*" is equivalent to a plain prefix match on "prefix".
            rules.append(RobotsTxtRule(RobotsTxtRule::Disallow, value.left(asteriskIndex)));
        }
        // Any other wildcard placement is not supported: the rule is skipped.
    }
    return rules;
}
robots_txt/robotstxt.h
#ifndef ROBOTSTXT_H
#define ROBOTSTXT_H
#include "data_structures/robotstxtrule. h"
/**
 * Interface of a robots.txt policy: one operation reads the rules of a site,
 * the other checks a URL against a rule set.
 */
class RobotsTxt
{
public:
    RobotsTxt();
    virtual ~RobotsTxt();

    /// Decides whether @p url may be visited under @p rules.
    virtual bool allowUrl(const QString &url,
                          const RTRContainer &rules) = 0;

    /// Produces the rule set for the site at @p siteUrl.
    virtual RTRContainer readRules(const QString &siteUrl) = 0;
};
#endif // ROBOTSTXT_H
robots_txt/robotstxt.cpp
#include "robotstxt. h"
|
Из-за большого объёма этот материал размещён на нескольких страницах:
1 2 3 4 5 6 7 8 9 |


