_edgesFiles. insert(phost, pedges);

  _eTSs. insert(phost, peTS);

  _pagesUnloaded. insert(phost, 0);

  }

}

void GephiSiteGraphRU::_unloadNodes()

{

  for (QTextStream* ts: _nTSs)

  {

  *ts << NODES_CSV_FILE_HEADER << endl;

  }

  PDContainer pDs;

  ulong id = 0;

  _pdM->getPDsStartingFromId(pDs, id);

  while(pDs. size())

  {

  for(const PageData& pD: pDs)

  {

  if (!pD. parsed)

  continue;

  ulong gephiId = _pagesUnloaded. value(pD. phD);

  _innerIdGephiId. insert(pD. id, gephiId);

  QTextStream *ts = _nTSs. value(pD. phD);

  *ts << gephiId << ";" << "\"" + pD. normalizedUrl + "\"" << ";"

  << pD. level << ";" << pD. outDegree << endl;

  gephiId++;

  _pagesUnloaded. insert(pD. phD, gephiId);

  _idHost. insert(pD. id, pD. phD);

  }

  id = pDs. last().id + 1;

  _pdM->getPDsStartingFromId(pDs, id);

  }

}

void GephiSiteGraphRU::_unloadEdges()

{

  for (QTextStream* ts: _eTSs)

  {

  *ts << EDGES_CSV_FILE_HEADER << endl;

  }

  PAContainer pAs;

  ulong id = 0;

  _pdM->getPAsStartingFromId(pAs, id);

  while(pAs. size())

  {

  for(const PageArc& pA: pAs)

  {

  ulong to = _innerIdGephiId. value(pA. to, 0);

  ulong from = _innerIdGephiId. value(pA. from, 0);

НЕ нашли? Не то? Что вы ищете?

  if (!(from || to))

  continue;

  HostData* phD = _idHost. value(pA. to, nullptr);

  if (phD == nullptr)

  continue;

  QTextStream *ts = _eTSs. value(phD);

  *ts << from << ";" << to << endl;

  }

  id = pAs. last().id + 1;

  _pdM->getPAsStartingFromId(pAs, id);

  }

}

void GephiSiteGraphRU::_endUnload()

{

  for (auto v: _nodesFiles)

  {

  v->close();

  delete v;

  }

  for (auto v: _edgesFiles)

  {

  v->close();

  delete v;

  }

  for (auto v: _nTSs)

  {

  delete v;

  }

  for (auto v: _eTSs)

  {

  delete v;

  }

  _nodesFiles. clear();

  _edgesFiles. clear();

  _nTSs. clear();

  _eTSs. clear();

  _hosts. clear();

  _innerIdGephiId. clear();

  _pagesUnloaded. clear();

}

result_unloaders/resultunloader.h

#ifndef RESULTUNLOADER_H

#define RESULTUNLOADER_H

#include "includes. h"

#include "data_managers/datamanager. h"

class ResultUnloader

{

public:

  ResultUnloader(DataManager *pdM);

  virtual ~ResultUnloader();

  virtual void unloadResult() = 0;

protected:

  DataManager *_pdM;

};

#endif // RESULTUNLOADER_H

result_unloaders/resultunloader.cpp

#include "resultunloader. h"

/// Remembers the data manager pointer in _pdM; nothing else is done here.
ResultUnloader::ResultUnloader(DataManager *pdM)
    : _pdM(pdM)
{
}

/// Empty destructor body; _pdM is not deleted here.
ResultUnloader::~ResultUnloader()
{
}

result_unloaders/testru.h

#ifndef TESTRU_H

#define TESTRU_H

#include "resultunloader. h"

class TestRU : public ResultUnloader

{

public:

  TestRU(DataManager *pdM);

  virtual void unloadResult();

protected:

  void _unloadNodes();

  void _unloadEdges();

};

#endif // TESTRU_H

result_unloaders/testru.cpp

#include "testru. h"

/// Passes the data manager on to the ResultUnloader base.
TestRU::TestRU(DataManager *pdM)
    : ResultUnloader(pdM)
{
}

void TestRU::unloadResult()

{

  _unloadNodes();

  _unloadEdges();

}

void TestRU::_unloadNodes()

{

  RCCSettings *psett = RCCSettings::instance();

  QString folder = psett->value( "ResultFolder", "r").toString();

  QFile file(folder + "/result_pagedatas. csv");

  file. open(QIODevice::WriteOnly);

  QTextStream ts(&file);

  ts << "id;site;normalizedUrl;url;idFrom;blocked;content;parsed;downloaded;errorCode;remove;outDegree;level;replaceId;downloadAttempts"

  << endl;

  PDContainer pDs;

  ulong id = 0;

  _pdM->getPDsStartingFromId(pDs, id);

  while(pDs. size())

  {

  for(const PageData& pD: pDs)

  {

  ts << pD. id << ";" << pD. phD->str + ";\"" << pD. normalizedUrl << "\";\"" << pD. url

  << "\";" << pD. idFrom << ";"<< pD. blocked << ";" << pD. content << ";"

  << pD. parsed << ";" <<pD. downloaded << ";"  <<pD. errorCode << ";"  << pD. remove << ";"

  << pD. outDegree << ";"  << pD. level << ";" << pD. replaceId << ";" << pD. downloadAttempts << endl;

  }

  id = pDs. last().id + 1;

  _pdM->getPDsStartingFromId(pDs, id);

  }

  file. close();

}

void TestRU::_unloadEdges()

{

  RCCSettings *psett = RCCSettings::instance();

  QString folder = psett->value( "ResultFolder", "r").toString();

  QFile file(folder + "/result_pagearcs. csv");

  file. open(QIODevice::WriteOnly);

  QTextStream ts(&file);

  ts << "id;from;to" << endl;

  PAContainer pAs;

  ulong id = 0;

  _pdM->getPAsStartingFromId(pAs, id);

  while(pAs. size())

  {

  for(const PageArc& pA: pAs)

  {

  ts << pA. toString() << endl;

  }

  id = pAs. last().id + 1;

  _pdM->getPAsStartingFromId(pAs, id);

  }

  file. close();

}

robots_txt/dummyrobotstxt.h

#ifndef DUMMYROBOTSTXT_H

#define DUMMYROBOTSTXT_H

#include "robotstxt. h"

class DummyRobotsTxt : public RobotsTxt

{

public:

  DummyRobotsTxt();

  virtual bool allowUrl(const QString &url,

  const RTRContainer &rules);

  virtual RTRContainer readRules(const QString &siteUrl);

};

#endif // DUMMYROBOTSTXT_H

robots_txt/dummyrobotstxt.cpp

#include "dummyrobotstxt. h"

/// Nothing to initialise.
DummyRobotsTxt::DummyRobotsTxt()
{
}

/// Permits every URL: neither the URL nor the rule list is inspected.
///
/// @return always true.
bool DummyRobotsTxt::allowUrl(const QString &url, const RTRContainer &rules)
{
    const bool allowed = true;
    return allowed;
}

/// Produces no rules: the site URL is not used and nothing is downloaded.
///
/// @return a default-constructed (empty) rule container.
RTRContainer DummyRobotsTxt::readRules(const QString &siteUrl)
{
    RTRContainer noRules = RTRContainer();
    return noRules;
}

robots_txt/onlydisallowrobotstxt.h

#ifndef ONLYDISALLOWROBOTSTXT_H

#define ONLYDISALLOWROBOTSTXT_H

#include "robotstxt. h"

#include "rccconsts. h"

class OnlyDisallowRobotsTxt : public RobotsTxt

{

public:

  OnlyDisallowRobotsTxt();

  virtual ~OnlyDisallowRobotsTxt();

  virtual bool allowUrl(const QString &url,

  const RTRContainer &rules);

  virtual RTRContainer readRules(const QString &siteUrl);

};

#endif // ONLYDISALLOWROBOTSTXT_H

robots_txt/onlydisallowrobotstxt.cpp

#include "onlydisallowrobotstxt. h"

/// Nothing to initialise.
OnlyDisallowRobotsTxt::OnlyDisallowRobotsTxt()
{
}

/// Nothing to release.
OnlyDisallowRobotsTxt::~OnlyDisallowRobotsTxt()
{
}

bool OnlyDisallowRobotsTxt::allowUrl(const QString &url, const RTRContainer &rules)

{

  for(const RobotsTxtRule &rule: rules)

  {

  if (rule. type == RobotsTxtRule::Disallow && url. startsWith(rule. templateStr))

  {

  return false;

  }

  }

  return true;

}

/// Downloads "<siteUrl>/robots.txt" and extracts the Disallow rules of the
/// first "User-agent: *" group.
///
/// The request is sent with RCCConsts::USER_AGENT and the call blocks in a
/// local event loop until the network manager reports that it finished.
///
/// Parsing rules:
///  - everything from '#' to the end of a line is a comment and is dropped;
///  - a line is split at its FIRST ':' into a field name (case-insensitive)
///    and a value, so values may themselves contain ':';
///  - lines before "User-agent: *" are ignored; the next "User-agent" line
///    after it ends the group;
///  - "Disallow" with an empty value adds no rule;
///  - a value without '*' becomes a prefix rule as is; a value whose only '*'
///    is the last character becomes a prefix rule without that '*';
///  - values with '*' anywhere else are not supported and are skipped.
///
/// @param siteUrl site root, without a trailing slash.
/// @return the collected rules; empty when the download fails.
RTRContainer OnlyDisallowRobotsTxt::readRules(const QString &siteUrl)
{
    RTRContainer rules;

    QNetworkAccessManager nam;
    QUrl url(siteUrl + "/robots.txt");
    QNetworkRequest request(url);
    request.setRawHeader("User-Agent", RCCConsts::USER_AGENT);

    QNetworkReply *preply = nam.get(request);

    // Block here until the manager signals that the request has finished.
    QEventLoop loop;
    QObject::connect(&nam, SIGNAL(finished(QNetworkReply*)), &loop, SLOT(quit()));
    loop.exec();

    if (preply->error() != QNetworkReply::NoError)
    {
        // Hand the reply back on the failure path as well.
        preply->deleteLater();
        return rules;
    }

    QString content(preply->readAll());
    preply->deleteLater();

    const QStringList lines = content.split(QRegExp("[\r\n]"), QString::SkipEmptyParts);
    bool inRules = false;

    for (const QString &rawLine : lines)
    {
        // Drop the comment before anything else so that it can neither hide a
        // "User-agent: *" line nor turn an empty Disallow into a rule.
        QString line = rawLine;
        const int hashPos = line.indexOf("#");
        if (hashPos != -1)
            line = line.left(hashPos);

        // Split at the first colon only: the value may contain further ones.
        const int colonPos = line.indexOf(":");
        if (colonPos == -1)
            continue;

        const QString field = line.left(colonPos).trimmed().toLower();
        const QString value = line.mid(colonPos + 1).trimmed();

        if (!inRules)
        {
            if (field == "user-agent" && value == "*")
                inRules = true;
            continue;
        }

        // The next User-agent line starts another group: stop reading.
        if (field == "user-agent")
            break;

        // An empty Disallow restricts nothing; as a prefix it would match
        // every URL, so it must not become a rule.
        if (field != "disallow" || value.isEmpty())
            continue;

        const int asteriskIndex = value.indexOf("*");
        if (asteriskIndex == -1)
        {
            rules.append(RobotsTxtRule(RobotsTxtRule::Disallow, value));
        }
        else if (asteriskIndex == value.length() - 1)
        {
            // A single trailing '*' is equivalent to a plain prefix match.
            rules.append(RobotsTxtRule(RobotsTxtRule::Disallow, value.left(asteriskIndex)));
        }
        // Any other wildcard position is not supported: the rule is skipped.
    }

    return rules;
}

robots_txt/robotstxt.h

#ifndef ROBOTSTXT_H

#define ROBOTSTXT_H

#include "data_structures/robotstxtrule. h"

class RobotsTxt

{

public:

  RobotsTxt();

  virtual ~RobotsTxt();

  virtual bool allowUrl(const QString &url,

  const RTRContainer &rules) = 0;

  virtual RTRContainer readRules(const QString &siteUrl) = 0;

};

#endif // ROBOTSTXT_H

robots_txt/robotstxt.cpp

#include "robotstxt. h"

Из-за большого объема этот материал размещен на нескольких страницах:
1 2 3 4 5 6 7 8 9