2021-8-9 补充:
首先我得说一下:垃圾CSDN
太垃圾了。
CSDN 上那么多关于 C++ 获取网页数据的代码,几乎全都是雷同的
你说离谱不离谱?看来看去就是那几段代码在重复。更离谱的是,有的人复制都没复制全,代码都运行不起来
真的是浪费时间。
最后总算是找到能用的了。
由于网上有好多是用 MFC 的库做的,但是对于 Console 项目来说用不了,所以我找了另外几种解决方案。
下面这些代码我都是改过的,调整了一下参数和返回值,用起来更方便。
1 -- 使用<wininet.h>系列函数
不多说上代码,VS2019 运行通过,以下是完整代码,可以直接运行。
里面用到了 utf8 和 gbk 编码转换的函数,见 http://huidong.xyz/index.php?mode=2&id=298
有的网页可能获取不到,原因不详,但是属于少数吧。
建议:可以将下面的代码中的 main 函数和 stdio.h 的引用删除,打包成 web.h。
#include <windows.h>
#include <wininet.h>
#include <stdio.h>
#include <string>
using namespace std;
#pragma comment(lib, "wininet.lib")
/**
* @brief UTF-8 编码字符串转 GBK 编码字符串
* @param[in] lpUTF8Str: 原 utf-8 字符串
* @param[out] lpGBKStr: 转码后的 gbk 字符串
* @param[in] nGBKStrLen: gbk 字符串的最大长度
* @return 返回转换后字符串的长度
* @note 代码来自 https://www.cnblogs.com/zhongbin/p/3160641.html
*/
int UTF8ToGBK(char* lpUTF8Str, char* lpGBKStr, int nGBKStrLen)
{
wchar_t* lpUnicodeStr = NULL;
int nRetLen = 0;
if (!lpUTF8Str) return 0;
nRetLen = ::MultiByteToWideChar(CP_UTF8, 0, (char*)lpUTF8Str, -1, NULL, NULL);
lpUnicodeStr = new WCHAR[nRetLen + 1];
nRetLen = ::MultiByteToWideChar(CP_UTF8, 0, (char*)lpUTF8Str, -1, lpUnicodeStr, nRetLen);
if (!nRetLen) return 0;
nRetLen = ::WideCharToMultiByte(CP_ACP, 0, lpUnicodeStr, -1, NULL, NULL, NULL, NULL);
if (!lpGBKStr)
{
if (lpUnicodeStr) delete[] lpUnicodeStr;
return nRetLen;
}
if (nGBKStrLen < nRetLen)
{
if (lpUnicodeStr) delete[] lpUnicodeStr;
return 0;
}
nRetLen = ::WideCharToMultiByte(CP_ACP, 0, lpUnicodeStr, -1, (char*)lpGBKStr, nRetLen, NULL, NULL);
if (lpUnicodeStr) delete[] lpUnicodeStr;
return nRetLen;
}
/**
* @brief 判断一个字符串是否为 UTF-8 编码
* @note 来自 https://blog.csdn.net/jiankekejian/article/details/106720432 (有删改)
*/
bool isUTF8(const char* str)
{
int length = strlen(str);
int check_sub = 0;
int i = 0;
int j = 0;
for (i = 0; i < length; i++)
{
if (check_sub == 0)
{
if ((str[i] >> 7) == 0)
{
continue;
}
struct
{
int cal;
int cmp;
} Utf8NumMap[] = { {0xE0,0xC0},{0xF0,0xE0},{0xF8,0xF0},{0xFC,0xF8},{0xFE,0xFC}, };
for (j = 0; j < (sizeof(Utf8NumMap) / sizeof(Utf8NumMap[0])); j++)
{
if ((str[i] & Utf8NumMap[j].cal) == Utf8NumMap[j].cmp)
{
check_sub = j + 1;
break;
}
}
if (0 == check_sub)
{
return false;
}
}
else
{
if ((str[i] & 0xC0) != 0x80)
{
return false;
}
check_sub--;
}
}
return true;
}
/**
* @brief 获取网页源码
* @param[in] Url 网页链接
* @param[in] 是否强制转换为 GBK 编码
* @return 返回网页源码
* @note 代码来自 https://www.cnblogs.com/croot/p/3391003.html (有删改)
*/
string GetWebSrcCode(LPCTSTR Url, bool isGBK = true)
{
string strHTML;
HINTERNET hSession = InternetOpen(L"IE6.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (hSession != NULL)
{
HINTERNET hURL = InternetOpenUrl(hSession, Url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0);
if (hURL != NULL)
{
const int nBlockSize = 1024;
char Temp[nBlockSize] = { 0 };
ULONG Number = 1;
while (Number > 0)
{
InternetReadFile(hURL, Temp, nBlockSize - 1, &Number);
for (int i = 0; i < (int)Number; i++)
strHTML += Temp[i];
}
InternetCloseHandle(hURL);
hURL = NULL;
}
InternetCloseHandle(hSession);
hSession = NULL;
}
if (isGBK && isUTF8(strHTML.c_str()))
{
string strGBK;
strGBK.resize(strHTML.size() * 2);
UTF8ToGBK(&strHTML[0], &strGBK[0], strHTML.size() * 2);
strGBK.resize(strlen(strGBK.c_str())); // 删除多余 \0
return strGBK;
}
else
{
return strHTML;
}
}
int main(int argc, TCHAR* argv[])
{
string str = GetWebSrcCode(L"http://www.huidong.xyz/");
printf("%s", str.c_str());
return 0;
}
2 -- 使用 win socket
来自 http://www.vcgood.com/archives/3759
和上面第一种一样,都使用了 utf8 和 gbk 的转换函数,见 http://huidong.xyz/index.php?mode=2&id=298
VS2019 实测可用
缺点:测试发现不能爬取的网站比第一种要多,比如 cnblogs.com 就爬不了。但是 www.baidu.com 什么的可以。
#include <stdio.h>
#include <winsock.h>
#include <string>
#pragma comment(lib, "ws2_32.lib")
using namespace std;
/**
* @brief UTF-8 编码字符串转 GBK 编码字符串
* @param[in] lpUTF8Str: 原 utf-8 字符串
* @param[out] lpGBKStr: 转码后的 gbk 字符串
* @param[in] nGBKStrLen: gbk 字符串的最大长度
* @return 返回转换后字符串的长度
* @note 代码来自 https://www.cnblogs.com/zhongbin/p/3160641.html
*/
int UTF8ToGBK(char* lpUTF8Str, char* lpGBKStr, int nGBKStrLen)
{
wchar_t* lpUnicodeStr = NULL;
int nRetLen = 0;
if (!lpUTF8Str) return 0;
nRetLen = ::MultiByteToWideChar(CP_UTF8, 0, (char*)lpUTF8Str, -1, NULL, NULL);
lpUnicodeStr = new WCHAR[nRetLen + 1];
nRetLen = ::MultiByteToWideChar(CP_UTF8, 0, (char*)lpUTF8Str, -1, lpUnicodeStr, nRetLen);
if (!nRetLen) return 0;
nRetLen = ::WideCharToMultiByte(CP_ACP, 0, lpUnicodeStr, -1, NULL, NULL, NULL, NULL);
if (!lpGBKStr)
{
if (lpUnicodeStr) delete[] lpUnicodeStr;
return nRetLen;
}
if (nGBKStrLen < nRetLen)
{
if (lpUnicodeStr) delete[] lpUnicodeStr;
return 0;
}
nRetLen = ::WideCharToMultiByte(CP_ACP, 0, lpUnicodeStr, -1, (char*)lpGBKStr, nRetLen, NULL, NULL);
if (lpUnicodeStr) delete[] lpUnicodeStr;
return nRetLen;
}
/**
* @brief 判断一个字符串是否为 UTF-8 编码
* @note 来自 https://blog.csdn.net/jiankekejian/article/details/106720432 (有删改)
*/
bool isUTF8(const char* str)
{
int length = strlen(str);
int check_sub = 0;
int i = 0;
int j = 0;
for (i = 0; i < length; i++)
{
if (check_sub == 0)
{
if ((str[i] >> 7) == 0)
{
continue;
}
struct
{
int cal;
int cmp;
} Utf8NumMap[] = { {0xE0,0xC0},{0xF0,0xE0},{0xF8,0xF0},{0xFC,0xF8},{0xFE,0xFC}, };
for (j = 0; j < (sizeof(Utf8NumMap) / sizeof(Utf8NumMap[0])); j++)
{
if ((str[i] & Utf8NumMap[j].cal) == Utf8NumMap[j].cmp)
{
check_sub = j + 1;
break;
}
}
if (0 == check_sub)
{
return false;
}
}
else
{
if ((str[i] & 0xC0) != 0x80)
{
return false;
}
check_sub--;
}
}
return true;
}
/**
* @brief 获取网页 HTML 源码
* @param[in] url: 网址
* @return 返回网页源码
* @note 代码来自 http://www.vcgood.com/archives/3759 (有删改)
*/
string geturl(const char* url)
{
string strHTML;
WSADATA WSAData = { 0 };
SOCKET sockfd;
struct sockaddr_in addr;
struct hostent* pURL;
char myurl[BUFSIZ];
char* pHost = 0, * pGET = 0;
char host[BUFSIZ], GET[BUFSIZ];
char header[BUFSIZ] = "";
int text_buf_size = 409600;
char* text = new char[text_buf_size];
memset(text, 0, text_buf_size);
if (WSAStartup(MAKEWORD(2, 2), &WSAData))
{
printf("WSA failed\n");
return {};
}
strcpy_s(myurl, 512, url);
for (pHost = myurl; *pHost != '/' && *pHost != '\0'; ++pHost);
if ((int)(pHost - myurl) == strlen(myurl))
strcpy_s(GET, 512, "/");
else
strcpy_s(GET, 512, pHost);
*pHost = '\0';
strcpy_s(host, 512, myurl);
sockfd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
pURL = gethostbyname(host);
addr.sin_family = AF_INET;
addr.sin_addr.s_addr = *((unsigned long*)pURL->h_addr);
addr.sin_port = htons(80);
strcat_s(header, 512, "GET ");
strcat_s(header, 512, GET);
strcat_s(header, 512, " HTTP/1.1\r\n");
strcat_s(header, 512, "HOST: ");
strcat_s(header, 512, host);
strcat_s(header, 512, "\r\nConnection: Close\r\n\r\n");
connect(sockfd, (SOCKADDR*)&addr, sizeof(addr));
send(sockfd, header, strlen(header), 0);
while (recv(sockfd, text, BUFSIZ, 0) > 0)
{
strHTML += text;
memset(text, 0, text_buf_size);
}
closesocket(sockfd);
WSACleanup();
delete[] text;
if (isUTF8(strHTML.c_str()))
{
string strGBK;
strGBK.resize(strHTML.size()*2);
UTF8ToGBK(&strHTML[0], &strGBK[0], strHTML.size() * 2);
return strGBK;
}
else
{
return strHTML;
}
}
int main()
{
char url[256] = { 0 };
printf("http://");
scanf_s("%s", url, 256);
string strHTML = geturl(url);
printf("%s", strHTML.c_str());
return 0;
}
------- 新增内容结束 ---------
这个是之前 ckj 写的,但是现在好像有点问题了:
代码:
#include <string.h>
#include <iostream>
#include <windows.h>
#include <sstream>
#pragma comment(lib,"ws2_32.lib")
using namespace std;
const int recvbuf_size = 204800000;
char recvbuf[recvbuf_size] = { 0 };
// 返回一个网页的HTML内容
// url 网页链接
string getweb(string url)
{
string ym, src;
if (url.find("http://") != string::npos)
{
if (url.find("/", 8) != string::npos)
{
ym = url.substr(7, url.find("/", 8) - 7);
src = url.substr(url.find("/", 8));
}
else
{
ym = url.substr(7);
src = "";
}
}
if (url.find("https://") != string::npos)
{
if (url.find("/", 9) != string::npos)
{
ym = url.substr(8, url.find("/", 9) - 8);
src = url.substr(url.find("/", 9));
}
else
{
ym = url.substr(8);
src = "";
}
}
SOCKET sever;
sockaddr_in severaddr;
WSADATA wsadata;
WSAStartup(MAKEWORD(2, 2), &wsadata);
sever = socket(PF_INET, SOCK_STREAM, 0);
memset(&severaddr, 0, sizeof(severaddr));
HOSTENT* host = (HOSTENT*)gethostbyname(ym.c_str());
severaddr.sin_family = AF_INET;
severaddr.sin_addr.S_un.S_addr = inet_addr(inet_ntoa(*(struct in_addr*)host->h_addr_list[0]));
severaddr.sin_port = htons(80);
connect(sever, (SOCKADDR*)&severaddr, sizeof(severaddr));
stringstream stream;
stream << "GET " << src << " HTTP/1.1\r\n";
stream << "Host: " << ym << "\r\n";
stream << "Connection: keep-alive\r\n";
stream << "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.48\r\n";
stream << "Accept - Encoding : gzip, deflate, br\r\n";
stream << "Accept - Language : zh - CN, zh; q = 0.9, en; q = 0.8, en - GB; q = 0.7, en - US; q = 0.6\r\n\r\n";
char sendbuf[2048];
memset(sendbuf, 0, sizeof(sendbuf));
stream.read(sendbuf, stream.tellp());
send(sever, sendbuf, sizeof(sendbuf), 0);
//cout << sendbuf << endl;
memset(recvbuf, 0, recvbuf_size);
recv(sever, recvbuf, recvbuf_size, 0);
//cout << recvbuf << endl;
string strrecv = recvbuf;
return strrecv;
}
int main()
{
// 爬取huidong网的首页
string web = getweb("http://www.huidong.xyz/index.php");
printf(web.c_str());
while(true);
return 0;
}
效果:
我发现如果把recvbuf改成局部变量,用new来分配内存的话获取到的网页内容就不太全,暂时未知原因。
有的网站爬不到,应该是请求头有问题。