????????????
A web crawler (Web crawler) is a program that “browses the web automatically”: starting from a given page, it fetches the content, follows the links it finds, and repeats the process on each newly discovered page.
????????·??
1. Fetch the html document of the page.
2. Parse the html for url entries and split them into two groups: page links go into hrefUrl and image links go into imgUrl.
3. Download the images listed in imgUrl and save them locally, using a file name derived from the url.
4. Continue breadth-first: take the next pending url from the queue and repeat the steps above until no url is left.
??????????????????
class Crawler
{
private:
string m_url;                     /// @brief ??????URL
queue<string> m_hrefUrl;          /// @brief ????????δ?????href
hash_set<string> m_visitedUrl;    /// @brief ??????????????????url
hash_set<string> m_visitedImg;    /// @brief ????????????????????
public:
/// @brief ????URL??????????????????
/// @param[in]  url ???????URL
/// @param[out] host ??????????host
/// @param[out]  resource ???????????????
bool parseURL(const string& url?? string& host?? string& resource);
/// @brief ???Get??????????????
/// @param[in]  url ?????url
/// @param[out] response ????????????????????
bool getHttpResponse(const string&url?? string *&response);
/// @brief ????????????????е?href?????URL
/// @param[in]  htmlResponse html????????
/// @param[out] imgurls ?洢????????url???
void htmlParse(string& htmlResponse?? vector<string>& imgurls);
/// @brief ??url?????????
/// @param[in] url ????????url
string toFileName(const string& url);
/// @brief ?????????img???????
/// @param[in] imgurls ????????url???
/// @param[in] url ?????????url???
void downLoadImg(vector<string>& imgurls?? const string& url);
/// @brief ???????url?μ???????
/// @param[in] url ???????????url
void bfs(const string& url);
/// @brief ??????????????url????
void start();
public:
/// @brief ??????
Crawler();
Crawler(const string &url);
~Crawler();
};
For the complete implementation of the methods declared above, see: https://github.com/yongssu/crawler
??????????????????????????????????ЩС??????????????????????????????????£?????????????????·????????????檔