return res;

}

/// Returns the qHash() value computed over the given page content.
/// @param content text to hash
/// @return hash value as produced by qHash(const QString&)
uint PageData::hashContent(const QString &content)
{
  const uint digest = qHash(content);
  return digest;
}

/**
 * Builds a canonical, host-relative form of a URL so that differently
 * written links to the same page compare equal.
 *
 * Steps performed:
 *  - HTML-escaped ampersands ("&amp;", including multiply escaped ones such
 *    as "&amp;amp;") are unescaped to "&";
 *  - double quotes are replaced by "%22";
 *  - the fragment is removed and path segments are normalized;
 *  - query items are ordered by key (items with equal keys keep their
 *    original relative order) and items with an empty key are dropped.
 *
 * @param urlStr URL text as found in the page.
 * @return the path followed by "?" and the sorted query (when there is one);
 *         the result always starts with '/'. Scheme and host are not part
 *         of the result.
 */
QString PageData::normalizeUrl(const QString &urlStr)
{
  QString cleaned(urlStr);

  // Guard against sloppy markup on the site: links may carry HTML-escaped
  // ampersands, sometimes escaped more than once. Each pass shortens the
  // string, so the loop always terminates.
  while (cleaned.contains("&amp;"))
  {
    cleaned.replace("&amp;", "&");
  }

  // Double quotes may arrive as the HTML entity or as a literal character;
  // both are turned into their percent-encoded form.
  cleaned.replace("&quot;", "%22");
  cleaned.replace("\"", "%22");

  QUrl url(cleaned);
  url = url.adjusted(QUrl::RemoveFragment | QUrl::NormalizePathSegments);

  QList<QPair<QString, QString> > items = QUrlQuery(url).queryItems();

  // stable_sort keeps repeated keys in their original order, so the result
  // is fully determined by the input (plain std::sort leaves that order
  // unspecified).
  std::stable_sort(items.begin(), items.end(),
    [](const QPair<QString, QString> &a, const QPair<QString, QString> &b)
    { return a.first < b.first; }
  );

  QString queryString;
  for (const QPair<QString, QString> &item : items)
  {
    // Items without a key contribute nothing to the normalized query.
    if (item.first.isEmpty())
      continue;

    if (!queryString.isEmpty())
      queryString += "&";
    queryString += item.first + "=" + item.second;
  }

  QString res = url.path();
  if (!queryString.isEmpty())
    res += '?' + queryString;

  // Always return a rooted path, even for an empty or relative input.
  if (res.isEmpty() || res.at(0) != '/')
    res = "/" + res;

  return res;
}

/// Free-function hash for QString that forwards to Qt's qHash().
/// NOTE(review): presumably the customization point picked up by boost::hash
/// for hashed indices keyed on QString — confirm against the callers.
std::size_t hash_value(const QString &x)
{
  const std::size_t digest = qHash(x);
  return digest;
}

data_structures/pastore.h

#ifndef PASTORE

#define PASTORE

#include "pagearc. h"

using namespace boost::multi_index;

НЕ нашли? Не то? Что вы ищете?

namespace PA

{

struct ById{};

struct ByFromAndTo{};

struct ByTo{};

struct ByFrom{};

struct FromChange : public std::unary_function<PageArc, void>

{

  ulong from;

  FromChange(const ulong& _from) : from(_from) {}

  void operator()(PageArc& pA)

  {

  pA. from = from;

  }

};

struct ToChange : public std::unary_function<PageArc, void>

{

  ulong to;

  ToChange(const ulong& _to) : to(_to) {}

  void operator()(PageArc& pA)

  {

  pA. to = to;

  }

};

}

// Container of PageArc records exposing four indices:
//   PA::ById        - ordered, unique,     key: PageArc::id
//   PA::ByTo        - ordered, non-unique, key: PageArc::to
//   PA::ByFrom      - ordered, non-unique, key: PageArc::from
//   PA::ByFromAndTo - hashed,  unique,     key: (PageArc::from, PageArc::to)
using PAStore = boost::multi_index_container<
  PageArc,
  indexed_by<
    ordered_unique<
      tag<PA::ById>,
      member<PageArc, ulong, &PageArc::id>
    >,
    ordered_non_unique<
      tag<PA::ByTo>,
      member<PageArc, ulong, &PageArc::to>
    >,
    ordered_non_unique<
      tag<PA::ByFrom>,
      member<PageArc, ulong, &PageArc::from>
    >,
    hashed_unique<
      tag<PA::ByFromAndTo>,
      composite_key<
        PageArc,
        member<PageArc, ulong, &PageArc::from>,
        member<PageArc, ulong, &PageArc::to>
      >
    >
  >
>;

#endif // PASTORE

data_structures/pdlogitem.h

#ifndef PDLOGITEM_H

#define PDLOGITEM_H

#include "pagedata. h"

struct PDLogItem

{

  PDLogItem();

  PDLogItem(const QString &message, const PageData &pD);

  QString message;

  PageData pD;

};

#endif // PDLOGITEM_H

data_structures/pdlogitem.cpp

#include "pdlogitem. h"

/// Creates an item holding an empty message string and a
/// value-initialized PageData.
PDLogItem::PDLogItem()
  : message(""), pD()
{
}

/// Creates an item holding copies of the given message and PageData.
/// The parameters share their names with the members they initialize.
PDLogItem::PDLogItem(const QString &message, const PageData &pD)
  : message(message), pD(pD)
{
}

data_structures/pdpacreatedata.h

#ifndef PDANDPACREATEDATA_H

#define PDANDPACREATEDATA_H

#include "pagedata. h"

struct PDPACreateData

{

  PDPACreateData();

  PDPACreateData(const QString &url, const QString &normalizedUrl, const PageData &pD);

  QString url;

  QString normalizedUrl;

  PageData pD;

};

#endif // PDANDPACREATEDATA_H

data_structures/pdpacreatedata.cpp

#include "pdpacreatedata. h"

/// Creates a bundle with empty URL strings and a value-initialized PageData.
PDPACreateData::PDPACreateData()
  : url(""), normalizedUrl(""), pD()
{
}

/// Creates a bundle holding copies of the given URL, normalized URL and
/// PageData. The parameters share their names with the members they
/// initialize.
PDPACreateData::PDPACreateData(const QString &url,
                               const QString &normalizedUrl,
                               const PageData &pD)
  : url(url), normalizedUrl(normalizedUrl), pD(pD)
{
}

data_structures/pdstore.h

#ifndef PDSTORE_H

#define PDSTORE_H

#include "includes. h"

#include "pagedata. h"

using namespace boost::multi_index;

namespace PD

{

struct ById{};

struct ByBlocked{};

struct ByPhDAndBlockedAndDownloaded{};

struct ByBlockedAndDownloadedAndParsedAndErrorCode{};

struct ByPhDAndContentHash{};

struct ByUrlAndPhD{};

struct ByPhDAndNormalizedUrl{};

struct ByIdFrom{};

struct BlockedChange : public std::unary_function<PageData, void>

{

  bool b;

  BlockedChange(const bool& _b) : b(_b) {}

  void operator()(PageData& pd)

  {

  pd. blocked = b;

  }

};

struct LevelChange : public std::unary_function<PageData, void>

{

  uint l;

  LevelChange(const uint& _l) : l(_l) {}

  void operator()(PageData& pd)

  {

  pd. level = l;

  }

};

}

// Container of PageData records. All indices are ordered; only
// PD::ByPhDAndNormalizedUrl is unique:
//   PD::ById                     - key: id
//   PD::ByBlocked                - key: blocked
//   PD::ByPhDAndBlockedAndDownloaded
//                                - key: (phD, blocked, downloaded)
//   PD::ByBlockedAndDownloadedAndParsedAndErrorCode
//                                - key: (blocked, downloaded, parsed, errorCode)
//   PD::ByPhDAndContentHash      - key: (phD, contentHash)
//   PD::ByPhDAndNormalizedUrl    - key: (phD, normalizedUrl), unique
//   PD::ByIdFrom                 - key: idFrom
// NOTE(review): the PD::ById index is declared non-unique — confirm that
// duplicate ids are really meant to be allowed.
using PDStore = boost::multi_index_container<
  PageData,
  indexed_by<
    ordered_non_unique<
      tag<PD::ById>,
      member<PageData, ulong, &PageData::id>
    >,
    ordered_non_unique<
      tag<PD::ByBlocked>,
      member<PageData, bool, &PageData::blocked>
    >,
    ordered_non_unique<
      tag<PD::ByPhDAndBlockedAndDownloaded>,
      composite_key<
        PageData,
        member<PageData, HostData*, &PageData::phD>,
        member<PageData, bool, &PageData::blocked>,
        member<PageData, bool, &PageData::downloaded>
      >
    >,
    ordered_non_unique<
      tag<PD::ByBlockedAndDownloadedAndParsedAndErrorCode>,
      composite_key<
        PageData,
        member<PageData, bool, &PageData::blocked>,
        member<PageData, bool, &PageData::downloaded>,
        member<PageData, bool, &PageData::parsed>,
        member<PageData, uint, &PageData::errorCode>
      >
    >,
    ordered_non_unique<
      tag<PD::ByPhDAndContentHash>,
      composite_key<
        PageData,
        member<PageData, HostData*, &PageData::phD>,
        member<PageData, uint, &PageData::contentHash>
      >
    >,
    ordered_unique<
      tag<PD::ByPhDAndNormalizedUrl>,
      composite_key<
        PageData,
        member<PageData, HostData*, &PageData::phD>,
        member<PageData, QString, &PageData::normalizedUrl>
      >
    >,
    ordered_non_unique<
      tag<PD::ByIdFrom>,
      member<PageData, ulong, &PageData::idFrom>
    >
  >
>;

// View of PDStore through its PD::ByBlocked index.
using BlockedList = PDStore::index<PD::ByBlocked>::type;

#endif // PDSTORE_H

data_structures/robotstxtrule.h

#ifndef ROBOTSTXTRULES_H

#define ROBOTSTXTRULES_H

#include "includes. h"

struct RobotsTxtRule

{

  enum RuleType

  {

  Allow,

  Disallow

  };

  RobotsTxtRule();

  RobotsTxtRule(const RobotsTxtRule::RuleType &type,

  const QString &templ);

  RobotsTxtRule::RuleType type;

  QString templateStr;

};

// Sequence of RobotsTxtRule values.
using RTRContainer = QVector<RobotsTxtRule>;

#endif // ROBOTSTXTRULES_H

data_structures/robotstxtrule.cpp

#include "robotstxtrule. h"

/// Creates a rule with a null template string.
/// `type` is set explicitly to Allow (the first enumerator): an enum member
/// left out of the initializer list has an indeterminate value, and reading
/// it is undefined behavior.
RobotsTxtRule::RobotsTxtRule() :
  type(Allow),
  templateStr()
{
}


/// Creates a rule of the given kind whose template string is a copy of templ.
RobotsTxtRule::RobotsTxtRule(const RobotsTxtRule::RuleType &type, const QString &templ)
  : type(type), templateStr(templ)
{
}

result_unloaders/gephisitegraphru.h

#ifndef GEPHISITEGRAPHRU_H

#define GEPHISITEGRAPHRU_H

#include "resultunloader. h"

class GephiSiteGraphRU : public ResultUnloader

{

public:

  GephiSiteGraphRU(DataManager *pdM);

  ~GephiSiteGraphRU();

  virtual void unloadResult();

protected:

  void _manageHosts();

  void _unloadNodes();

  void _unloadEdges();

  void _endUnload();

  QHash<ulong, ulong> _innerIdGephiId;

  QVector<HostData*> _hosts;

  QMap<HostData*, QFile*> _nodesFiles;

  QMap<HostData*, QTextStream*> _nTSs;

  QMap<HostData*, QFile*> _edgesFiles;

  QMap<HostData*, QTextStream*> _eTSs;

  QMap<HostData*, ulong> _pagesUnloaded;

  QMap<ulong, HostData*> _idHost;

  const QString NODES_CSV_FILE_HEADER = "id;label;level;out_degree";

  const QString EDGES_CSV_FILE_HEADER = "source;target";

};

#endif // GEPHISITEGRAPHRU_H

result_unloaders/gephisitegraphru.cpp

#include "gephisitegraphru. h"

/// Forwards the data manager to the ResultUnloader base; no other setup.
GephiSiteGraphRU::GephiSiteGraphRU(DataManager *pdM)
  : ResultUnloader(pdM)
{
}

/// Performs no work beyond destroying the members and the base.
GephiSiteGraphRU::~GephiSiteGraphRU() = default;

void GephiSiteGraphRU::unloadResult()

{

  _manageHosts();

  _unloadNodes();

  _unloadEdges();

  _endUnload();

}

void GephiSiteGraphRU::_manageHosts()

{

  RCCSettings *psett = RCCSettings::instance();

  QString folder = psett->value( "ResultFolder", "r").toString();

  _hosts = _pdM->getHosts();

  for (HostData* phost: _hosts)

  {

  QString fNPart = phost->protocol + "_" + phost->host + ".csv";


  QFile* pnodes = new QFile(folder + "/nodes_" + fNPart);

  pnodes->open(QIODevice::WriteOnly);

  QTextStream* pnTS = new QTextStream(pnodes);

  _nodesFiles. insert(phost, pnodes);

  _nTSs. insert(phost, pnTS);

  QFile* pedges = new QFile(folder + "/edges_" + fNPart);

  pedges->open(QIODevice::WriteOnly);

  QTextStream* peTS = new QTextStream(pedges);

Из за большого объема этот материал размещен на нескольких страницах:
1 2 3 4 5 6 7 8 9