-
C#教程之C#远程HTML数据抓取实例分享
代码如下:
/// <summary>
/// Fetches a remote document synchronously through the MSXML2 XMLHTTP COM object.
/// </summary>
/// <param name="url">Target URL. For GET requests <paramref name="param"/> is appended as the query string.</param>
/// <param name="methed">HTTP verb, compared case-insensitively (e.g. "get" or "post").</param>
/// <param name="param">URL-encoded parameters: the query string for GET, the request body otherwise. May be null or empty.</param>
/// <param name="html">On success the response text; on failure an error message.</param>
/// <returns><c>true</c> when the request completed and the response text was read; otherwise <c>false</c>.</returns>
public static bool GetHttp(string url, string methed, string param, out string html)
{
    try
    {
        // Invariant lower-casing avoids culture surprises (e.g. Turkish dotless i),
        // and the null-coalescing keeps a null verb inside the normal error path.
        methed = (methed ?? string.Empty).ToLowerInvariant();
        bool hasParam = !string.IsNullOrEmpty(param);
        bool isGet = methed == "get";

        if (hasParam && isGet)
        {
            // Extend an existing query string with '&' instead of adding a second '?'.
            bool hasQuery = url != null && url.IndexOf('?') >= 0;
            url += (hasQuery ? "&" : "?") + param;
        }

        MSXML2.XMLHTTP mx = new MSXML2.XMLHTTPClass();
        mx.open(methed, url, false, null, null);
        if (hasParam && methed == "post")
        {
            // Content-Length is intentionally not set by hand: param.Length counts
            // UTF-16 characters, not the bytes that actually go on the wire.
            // NOTE(review): relies on XMLHTTP computing the header itself — confirm against MSXML docs.
            mx.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
        }

        // For GET the parameters already travel in the URL, so no body is sent.
        mx.send(isGet ? null : param);

        // readyState 4 means the synchronous request has fully completed.
        if (mx.readyState != 4)
        {
            html = "远程连接失败:-4";
            return false;
        }

        html = mx.responseText;
        return true;
    }
    catch (Exception ex)
    {
        html = "远程连接失败:" + ex.Message;
        return false;
    }
}
/// <summary>
/// Fetches a remote document with <see cref="HttpWebRequest"/> and decodes it with the given encoding.
/// </summary>
/// <param name="url">Target URL. For GET requests <paramref name="param"/> is appended as the query string.</param>
/// <param name="methed">HTTP verb; "get" and "post" are recognised case-insensitively.</param>
/// <param name="param">URL-encoded parameters: the query string for GET, the request body for POST. May be null.</param>
/// <param name="referer">Value for the Referer header; skipped when null or empty.</param>
/// <param name="encode">Name of the text encoding for the POST body and the response; falls back to "utf-8" when null or empty.</param>
/// <param name="html">On success the response body; on failure an error message that includes the URL.</param>
/// <returns><c>true</c> when the response was read successfully; otherwise <c>false</c>.</returns>
public static bool GetHttp1(string url, string methed, string param, string referer, string encode, out string html)
{
    // Case-insensitive verb checks, so "GET"/"POST" no longer silently drop the parameters.
    bool isGet = string.Equals(methed, "get", StringComparison.OrdinalIgnoreCase);
    bool isPost = string.Equals(methed, "post", StringComparison.OrdinalIgnoreCase);

    if (isGet && !string.IsNullOrEmpty(param))
    {
        // Extend an existing query string with '&', otherwise start one with '?'.
        bool hasQuery = url != null && url.IndexOf('?') >= 0;
        url += (hasQuery ? "&" : "?") + param;
    }

    try
    {
        Encoding encoding = Encoding.GetEncoding(string.IsNullOrEmpty(encode) ? "utf-8" : encode);

        HttpWebRequest webreq = (HttpWebRequest)WebRequest.Create(url);
        webreq.Proxy = null;
        webreq.Timeout = 1000 * 6; // 6 seconds
        webreq.ContentType = "application/x-www-form-urlencoded";
        // The property value must not repeat the "User-Agent:" header name.
        // Usual layout: browser id (OS; encryption level; language) rendering engine version.
        webreq.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0";
        if (!string.IsNullOrEmpty(referer))
        {
            webreq.Referer = referer;
        }
        webreq.CookieContainer = new CookieContainer();
        // Send the canonical upper-case verb for the two verbs handled here.
        webreq.Method = isGet ? "GET" : (isPost ? "POST" : methed);

        if (isPost && param != null)
        {
            byte[] body = encoding.GetBytes(param);
            webreq.ContentLength = body.Length;
            // 'using' closes the request stream even if the write throws.
            using (Stream requestStream = webreq.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }
        }

        using (HttpWebResponse webres = (HttpWebResponse)webreq.GetResponse())
        using (Stream dataStream = webres.GetResponseStream())
        using (StreamReader reader = new StreamReader(dataStream, encoding))
        {
            html = reader.ReadToEnd();
            // Kept from the original: the author notes this may avoid hangs/blocking on repeated requests.
            webreq.Abort();
        }
    }
    catch (Exception ex)
    {
        html = "出现异常(HttpHelper.GetHTML),远程连接失败:" + ex.Message + " url:" + url;
        return false;
    }
    return true;
}
栏目列表
最新更新
nodejs爬虫
Python正则表达式完全指南
爬取豆瓣Top250图书数据
shp 地图文件批量添加字段
爬虫小试牛刀(爬取学校通知公告)
【python基础】函数-初识函数
【python基础】函数-返回值
HTTP请求:requests模块基础使用必知必会
Python初学者友好丨详解参数传递类型
如何有效管理爬虫流量?
SQL SERVER中递归
2个场景实例讲解GaussDB(DWS)基表统计信息估
常用的 SQL Server 关键字及其含义
动手分析SQL Server中的事务中使用的锁
openGauss内核分析:SQL by pass & 经典执行
一招教你如何高效批量导入与更新数据
天天写SQL,这些神奇的特性你知道吗?
openGauss内核分析:执行计划生成
[IM002]Navicat ODBC驱动器管理器 未发现数据
初入Sql Server 之 存储过程的简单使用
这是目前我见过最好的跨域解决方案!
减少回流与重绘
减少回流与重绘
如何使用KrpanoToolJS在浏览器切图
performance.now() 与 Date.now() 对比
一款纯 JS 实现的轻量化图片编辑器
关于开发 VS Code 插件遇到的 workbench.scm.
前端设计模式——观察者模式
前端设计模式——中介者模式
创建型-原型模式