//由于我所抓取的网页存在多个困难:1、以上三种方式都无法获取到源码,无论设置何种请求头都没用;2、单独只用 WebBrowser 无法进行翻页操作。所以采用 WebBrowser 与 IE 结合的方式来抓取。
//本项目流程:先打开 IE,再用 MSHTML 里的方法操作 IE 表单进行翻页,然后将列表中的网址在 WebBrowser 中一一打开,这样才能获取到源码。
//这个项目的意义在于:无论网站以何种方式加载,都可以抓取到内容。
//项目源码地址见文章最后。
// Province names written into the site's province query field by appStart, in crawl order.
string[] province = { "北京市", "天津市", "河北省", "山西省", "内蒙古", "辽宁省", "吉林省", "黑龙江省", "上海市", "江苏省", "浙江省", "安徽省", "福建省", "江西省", "山东省", "河南省", "湖北省", "湖南省", "广东省", "广西壮族", "海南省", "重庆市", "四川省", "贵州省", "云南省", "西藏", "陕西省", "甘肃省", "青海省", "宁夏回族", "新疆维吾尔", "新疆建设兵团" };
// Code written into the hidden "_v" field for the province at the same index in `province`.
// Hainan (index 20) is 46; the previous value 45 duplicated Guangxi's code.
int[] provinceCode = { 11, 12, 13, 14, 15, 21, 22, 23, 31, 32, 33, 34, 35, 36, 37, 41, 42, 43, 44, 45, 46, 50, 51, 52, 53, 54, 61, 62, 63, 64, 65, 66 };
//private Thread Thread_land;
/// <summary>
/// Click handler for button1: kicks off <c>appStart</c> through the delegate's
/// asynchronous BeginInvoke so the handler returns without waiting for the crawl.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    Action crawl = appStart;
    // No completion callback and no state object are supplied.
    crawl.BeginInvoke(null, null);
}
string TaskProgress = "";// Task progress: comma-separated resume checkpoint, loaded and rebuilt in appStart and saved through CJLog when the crawl stops
/// <summary>
/// Crawl driver, started asynchronously from button1_Click.
/// Loads the saved checkpoint, finds the Internet Explorer window showing the
/// landchina.com query page, and for every province/year fills the query form by
/// injecting JavaScript, submits it, then walks the result pages and hands each
/// page's HTML to <c>analysisSource</c>.
/// Errors are shown in textBox1 and logged through CJLog; the last checkpoint is
/// always saved in the finally block.
/// </summary>
private void appStart()
{
    try
    {
        // UI controls must be touched on the UI thread, hence Invoke.
        Action at = new Action(() => { textBox1.Text = "采集开始;\r\n"; labCount.Text = "0"; timer1.Enabled = true; labStartTime.Text = DateTime.Now.ToString(); });
        this.Invoke(at);

        // Defaults: first province, year 2011, and page 2 (page 1 is analysed
        // right after the query button is clicked).
        int proIndex = 0;
        int yearIndex = 2011;
        int pageIndex = 2;

        // Restore the checkpoint: "province,code,year,page".
        LandDB.BLL.CJLog cjlogBLL = new LandDB.BLL.CJLog();
        TaskProgress = cjlogBLL.GetTaskProgress();
        if (!string.IsNullOrEmpty(TaskProgress))
        {
            string[] s = TaskProgress.Split(',');
            int savedProvince = s.Length >= 4 ? Array.IndexOf(province, s[0]) : -1;
            if (savedProvince >= 0)
            {
                proIndex = savedProvince;

                // Older checkpoints stored the full start date instead of the year,
                // so accept either form.
                int savedYear;
                DateTime savedDate;
                if (int.TryParse(s[2], out savedYear))
                {
                    yearIndex = savedYear;
                }
                else if (DateTime.TryParse(s[2], out savedDate))
                {
                    yearIndex = savedDate.Year;
                }

                int savedPage;
                if (int.TryParse(s[3], out savedPage))
                {
                    pageIndex = savedPage;
                }
            }
        }

        // Look through every open shell/IE window for the query page.
        SHDocVw.ShellWindows IETabs = new SHDocVw.ShellWindows();
        foreach (SHDocVw.InternetExplorer ieTab in IETabs)
        {
            if (!ieTab.LocationURL.Contains("www.landchina.com/default.aspx"))
            {
                continue;
            }

            // "<" rather than "<=": province[province.Length] does not exist.
            for (int i = proIndex; i < province.Length; i++)
            {
                for (int y = yearIndex; y <= DateTime.Now.Year; y++)
                {
                    // Fill the query form by appending a <script> element to the page.
                    mshtml.HTMLDocument doc = ieTab.Document as mshtml.HTMLDocument;
                    mshtml.IHTMLScriptElement script = doc.createElement("script") as mshtml.IHTMLScriptElement;
                    script.text = string.Format("document.getElementById('TAB_queryTblEnumItem_227').value='{0}';", province[i]);
                    script.text += string.Format("document.getElementById('TAB_queryTblEnumItem_227_v').value={0};", provinceCode[i]);
                    script.text += "document.getElementById('TAB_QueryConditionItem227').checked = true;";
                    script.text += "document.getElementById('TAB_QueryConditionItem268').checked = true;";
                    DateTime dtbegin = new DateTime(y, 1, 1);
                    DateTime dtend = new DateTime(y, 12, 31);
                    // NOTE(review): the dates are formatted with the current culture; confirm the site accepts that format.
                    script.text += string.Format("document.getElementById('TAB_queryDateItem_268_1').value='{0}';", dtbegin); // start date
                    script.text += string.Format("document.getElementById('TAB_queryDateItem_268_2').value='{0}';", dtend);   // end date
                    mshtml.HTMLBody body = doc.body as mshtml.HTMLBody;
                    body.appendChild((mshtml.IHTMLDOMNode)script);

                    // Submit the query by clicking the page's own button.
                    mshtml.IHTMLDocument2 doc2 = (mshtml.IHTMLDocument2)ieTab.Document;
                    mshtml.IHTMLElementCollection inputs = (mshtml.IHTMLElementCollection)doc2.all.tags("INPUT");
                    mshtml.IHTMLElement element_post = (mshtml.IHTMLElement)inputs.item("TAB_QueryButtonControl", 0);
                    element_post.click();

                    // First result page.
                    doc = ieTab.Document as mshtml.HTMLDocument;
                    analysisSource(doc.body.innerHTML);

                    // Total page count from the pager text. When no pager is found the
                    // count stays 0 and the paging loop below is skipped instead of
                    // aborting the whole crawl with a FormatException.
                    Regex re = new Regex("共([0-9]{1,5})页([\\s\\ ]*共[0-9]{1,20})条记录", RegexOptions.Multiline);
                    Match ma = re.Match(doc.body.innerHTML);
                    int pageCount;
                    if (!int.TryParse(ma.Groups[1].Value, out pageCount))
                    {
                        pageCount = 0;
                    }

                    for (int j = pageIndex; j <= pageCount; j++)
                    {
                        // Store the year (not the DateTime) so the checkpoint can be parsed back.
                        TaskProgress = province[i] + "," + provinceCode[i] + "," + y + "," + j;

                        // Ask the page's own pager to jump to page j.
                        script = doc.createElement("script") as mshtml.IHTMLScriptElement;
                        script.text = string.Format("QueryAction.GoPage('TAB',{0})", j);
                        body = doc.body as mshtml.HTMLBody;
                        body.appendChild((mshtml.IHTMLDOMNode)script);

                        analysisSource(doc.body.innerHTML);

                        Action at1 = new Action(() => { textBox1.Text = ""; });
                        this.Invoke(at1);
                    }
                    // Only the resumed province/year starts at the saved page.
                    pageIndex = 2;
                }
                // Only the resumed province starts at the saved year.
                yearIndex = 2011;
            }
        }
    }
    catch (Exception ex)
    {
        Action at = new Action(() => { textBox1.Text += ex.Message + "\r\n"; });
        this.Invoke(at);

        // Record the error in the database.
        LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
        LandDB.Model.CJLog model = new LandDB.Model.CJLog();
        model.FSourceContent = "appStart:" + ex.Message;
        model.Furl = ReadBrowserUrlOnUiThread();
        bll.Add(model);
    }
    finally
    {
        Action at = new Action(() => { textBox1.Text += "采集停止;\r\n"; timer1.Enabled = false; });
        this.Invoke(at);

        // Record the checkpoint in the database.
        LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
        LandDB.Model.CJLog model = new LandDB.Model.CJLog();
        model.TaskProgress = TaskProgress;
        model.FSourceContent = "appStart:";
        model.Furl = ReadBrowserUrlOnUiThread();
        bll.Add(model);
        // The former this.EndInvoke(null) was removed: Control.EndInvoke rejects a
        // null IAsyncResult, and every Invoke above is synchronous anyway.
    }
}

/// <summary>
/// Reads the URL of the document currently loaded in webBrowser1 on the UI thread.
/// Returns an empty string when no document (or no URL) is available.
/// </summary>
private string ReadBrowserUrlOnUiThread()
{
    Func<string> read = () =>
    {
        var current = webBrowser1.Document;
        return (current != null && current.Url != null) ? current.Url.ToString() : string.Empty;
    };
    return (string)this.Invoke(read);
}
/// <summary>
/// Extracts the announcement-detail links (bulWorkView.aspx) from a result-list
/// page and opens each one in webBrowser1, pausing three seconds between links.
/// </summary>
/// <param name="source">HTML of the result-list page; null or empty is ignored.</param>
private void analysisSource(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return;
    }

    // Strip line breaks. String.Replace returns a new string, so the result has to
    // be assigned; the previous call discarded it and also searched for a literal
    // backslash sequence rather than real CR/LF characters.
    source = Regex.Replace(source, "([\\r\\n])[\\s]+", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
    source = Regex.Replace(source, "\\n", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
    source = source.Replace("\r", "");

    // Anchor tags whose href points at the detail page. The attribute part is
    // "[^>]*?" (anything inside the tag) instead of the old character class that
    // rejected any attribute containing the letters h, r, e or f; "." and "?" are
    // escaped so they match literally. Group 1 is the href value.
    Regex linkPattern = new Regex(
        "<a\\s[^>]*?href=\"(/DesktopModule/BizframeExtendMdl/workList/bulWorkView\\.aspx\\?[^\"]+)\"",
        RegexOptions.Multiline | RegexOptions.IgnoreCase);

    foreach (Match link in linkPattern.Matches(source))
    {
        // innerHTML encodes "&" inside attribute values as "&amp;"; decode it to get a usable URL.
        string url = "http://www.landchina.com" + link.Groups[1].Value.Replace("&amp;", "&");

        Action at = new Action(() =>
        {
            webBrowser1.Navigate(url);
            textBox1.Text += url + "\r\n";
        });
        this.Invoke(at);

        // Pause before requesting the next link; the page itself is processed in
        // webBrowser1_DocumentCompleted.
        Thread.Sleep(3000);
    }
}
/// <summary>
/// DocumentCompleted handler of webBrowser1. Takes the loaded page's body HTML,
/// cuts out the announcement section and passes it to <c>getParam</c>.
/// Pages that are empty or contain "占地公告" are logged as empty; exceptions are
/// shown in textBox1 and logged through CJLog.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data; its Url is used only as a fallback for logging.</param>
private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // Resolved once, null-safely, so the logging paths below cannot themselves throw.
    string pageUrl = string.Empty;
    try
    {
        var loaded = webBrowser1.Document;
        if (loaded != null && loaded.Url != null)
        {
            pageUrl = loaded.Url.ToString();
        }
        else if (e != null && e.Url != null)
        {
            pageUrl = e.Url.ToString();
        }

        string webInfo = (loaded != null && loaded.Body != null) ? loaded.Body.InnerHtml : null;
        if (!string.IsNullOrEmpty(webInfo) && webInfo.IndexOf("占地公告") == -1)
        {
            // Strip line breaks (String.Replace returns a new string; the old call discarded it).
            webInfo = Regex.Replace(webInfo, "([\\r\\n])[\\s]+", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
            webInfo = Regex.Replace(webInfo, "\\n", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
            webInfo = webInfo.Replace("\r", "");
            webInfo = CleanWordHtml(webInfo);

            // Keep only the span from the content cell up to the "lnkOldBul" link.
            string Splitstr = @"(<TD class=[""']?cellBordy[""']? vAlign=[""']?top[""']?>).*?(<A (id=[""']?lnkOldBul[""']? class=[""']?link1[""']?>|class=[""']?link1[""']? id=[""']?lnkOldBul[""']?))";
            Match section = Regex.Match(webInfo, Splitstr, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            webInfo = section.Groups[0].Value.ToLower();
            if (webInfo != "")
            {
                getParam(webInfo, pageUrl);
            }
            textBox1.Text += "采集完成\r\n";
            labCount.Text = (int.Parse(labCount.Text) + 1).ToString();
        }
        else
        {
            textBox1.Text += "空页面\r\n";
            // Record the empty page in the database.
            LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
            LandDB.Model.CJLog model = new LandDB.Model.CJLog();
            model.Furl = pageUrl;
            bll.Add(model);
        }
    }
    catch (Exception ex)
    {
        Action at = new Action(() => { textBox1.Text += ex.Message + "\r\n"; });
        this.Invoke(at);
        // Record the error in the database.
        LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
        LandDB.Model.CJLog model = new LandDB.Model.CJLog();
        model.FSourceContent = ex.Message;
        model.Furl = pageUrl;
        bll.Add(model);
    }
}
/// <summary>
/// Handler with the FormClosed event signature; currently does nothing.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Main_FormClosed(object sender, FormClosedEventArgs e)
{
}
/// <summary>
/// Parses an announcement page and stores it: extracts title, announcement number
/// and publishing unit, then inserts a new blockNote (with its blocks) or, when a
/// matching note already exists with gtState 2 or 3, re-collects it.
/// gtState values as used here: 1 = collected, 2 = collection failed,
/// 3 = blocks incomplete.
/// </summary>
/// <param name="strWebData">Announcement section of the page HTML.</param>
/// <param name="url">URL of the page, stored as the note's gtUrl.</param>
private void getParam(string strWebData, string url)
{
    LandDB.BLL.blockNote bnBll = new LandDB.BLL.blockNote();
    LandDB.Model.blockNote bnModel = new LandDB.Model.blockNote();
    bnModel.gtUrl = url;
    bnModel.dataType = 1;

    // Title: text of the first <span>.
    Match ma = Regex.Match(strWebData, "<span[^>]*>([^<]*)</span>", RegexOptions.IgnoreCase);
    string strTitle = ma.Groups[1].Value;
    bnModel.topic = strTitle;

    // Announcement number: the first parenthesised part of the title.
    ma = Regex.Match(strTitle, "[(].*?[)]");
    string noteno = ma.Groups[0].Value.Replace("(", "").Replace(")", "");
    bnModel.noteNo = noteno;

    // Publishing unit.
    ma = Regex.Match(strWebData, @"<td[^>](?:align=[""']?right[""']?)*>([^<]*)<br>", RegexOptions.IgnoreCase);
    bnModel.pubUnit = ma.Groups[1].Value.Trim();

    // The values come from a scraped page, so double any single quote before
    // embedding them in the SQL filter text.
    string filter = string.Format(
        "topic='{0}' and noteNo='{1}' and state=1 ",
        strTitle.Replace("'", "''"),
        noteno.Replace("'", "''"));

    if (bnBll.Exists(filter))
    {
        bnModel = bnBll.GetModel(strTitle, noteno);
        if (bnModel == null)
        {
            return;
        }

        if (bnModel.gtState == 2)
        {
            // Previous collection failed: re-parse the note, update it, then redo the blocks.
            AddblockNote(strWebData, ref bnModel);
            bnBll.Update(bnModel);
            CollectBlocksAndFlagNote(strWebData, bnModel, bnBll);
        }
        else if (bnModel.gtState == 3)
        {
            // Note is fine but its blocks were incomplete: redo the blocks only.
            CollectBlocksAndFlagNote(strWebData, bnModel, bnBll);
        }
        return;
    }

    // New announcement: insert the note, then its blocks.
    AddblockNote(strWebData, ref bnModel);
    bnModel.noteId = bnBll.Add(bnModel);
    CollectBlocksAndFlagNote(strWebData, bnModel, bnBll);
}

/// <summary>
/// Runs AddblockInfo for the note and sets the note's gtstate to 3 when
/// AddblockInfo reports (through its out flag) that incomplete blocks exist.
/// </summary>
private void CollectBlocksAndFlagNote(string strWebData, LandDB.Model.blockNote bnModel, LandDB.BLL.blockNote bnBll)
{
    bool isComp;
    AddblockInfo(strWebData, bnModel, out isComp);
    if (isComp)
    {
        bnBll.Update("gtstate=3", bnModel.noteId);
    }
}
/// <summary>
/// Fills a blockNote model from the announcement HTML: region (province / city /
/// zone), transfer date, transfer method, creation date, expiry date and publish
/// date, then sets gtState to 1 when every required field is present and to 2
/// otherwise. Nothing is written to the database here.
/// </summary>
/// <param name="strWebData">Announcement section of the page HTML.</param>
/// <param name="bnModel">Model to fill; topic and pubUnit are expected to be set by the caller.</param>
private void AddblockNote(string strWebData, ref LandDB.Model.blockNote bnModel)
{
    string province = "";
    string city = "";
    string blockZone = "";
    DateTime parsedDate;

    // Region: "province > city > zone" inside the lblXzq span.
    Match ma = Regex.Match(strWebData, @"<SPAN[^>](?:id=[""']?lblXzq[""']?)*>([^<]*)</SPAN>", RegexOptions.IgnoreCase);
    string partweb = ma.Groups[1].Value;
    if (partweb != "")
    {
        // The separator may arrive HTML-encoded, so accept both forms.
        string[] s = partweb.Replace("&gt;", "|").Replace(">", "|").Split('|');
        province = s[0].Replace("行政区:", "").Trim();
        if (s.Length >= 2)
        {
            city = s[1];
            blockZone = s.Length == 3 ? s[2] : "";
        }
    }
    bnModel.province = province;
    bnModel.city = city;
    bnModel.blockZone = blockZone;

    // Transfer date. An unparseable date is left unset so the completeness check
    // below can grade the note, instead of throwing before gtState is assigned.
    ma = Regex.Match(strWebData, "(?:号地块:)(.*?日)", RegexOptions.IgnoreCase);
    partweb = ma.Groups[1].Value;
    if (partweb == "") // not present here: look in the alternative location
    {
        ma = Regex.Match(strWebData, "(六、).*?[年]?(</u>)", RegexOptions.IgnoreCase);
        partweb = ma.Groups[0].Value;
        ma = Regex.Match(partweb, "<u>.*?日", RegexOptions.IgnoreCase);
        partweb = ma.Groups[0].Value.Replace("年", "-").Replace("月", "-").Replace("日", "").Replace("<u>", "").Replace("<U>", "").Trim();
    }
    else
    {
        partweb = partweb.Replace("<U>", "").Replace("<u>", "").Trim();
    }
    if (DateTime.TryParse(partweb, out parsedDate))
    {
        bnModel.transferDate = parsedDate;
    }

    // Transfer method.
    ma = Regex.Match(strWebData, "(?:以 <u>)(.*?)(?:</u> 方式出让 <u>)", RegexOptions.IgnoreCase);
    partweb = ma.Groups[1].Value;
    if (partweb != "")
    {
        bnModel.remiseWay = partweb;
    }

    // Creation time: today's date without a time component.
    bnModel.createTime = DateTime.Today;

    // Expiry date.
    ma = Regex.Match(strWebData, "(?:号地块:<u>).*?(?:</u> ;)", RegexOptions.IgnoreCase);
    partweb = ma.Groups[0].Value;
    if (partweb != "")
    {
        ma = Regex.Match(partweb, "(至 <u>).*?(日)", RegexOptions.IgnoreCase);
        partweb = ma.Groups[0].Value.Replace("至 <U>", "").Replace("<u>", "");
        // Turn every remaining Chinese character (年/月/日/至) into a dash.
        partweb = Regex.Replace(partweb, "[\u4e00-\u9fa5]", "-").Trim('-');
    }
    else // auction
    {
        ma = Regex.Match(strWebData, "(截止时间为).*?[年]?(日)");
        partweb = ma.Groups[0].Value.Replace("年", "-").Replace("月", "-").Replace("日", "").Replace("截止时间为", "").Replace("<u>", "").Replace("<U>", "").Trim();
    }
    if (DateTime.TryParse(partweb, out parsedDate))
    {
        bnModel.expireDate = parsedDate;
    }

    // Publish date.
    ma = Regex.Match(strWebData, @"<SPAN[^>](id=[""']?lblCreateDate[""']?)*>([^<]*)</SPAN>", RegexOptions.IgnoreCase);
    partweb = ma.Groups[0].Value.Trim();
    if (partweb != "")
    {
        partweb = Regex.Match(partweb, "\\d{4}年\\d{1,2}月\\d{1,2}日").Value;
        if (DateTime.TryParse(partweb, out parsedDate))
        {
            bnModel.pubDate = parsedDate;
        }
    }

    // 1 = collected, 2 = collection failed (a required field is missing).
    if (string.IsNullOrEmpty(bnModel.province) || string.IsNullOrEmpty(bnModel.city) || string.IsNullOrEmpty(bnModel.topic) || bnModel.pubDate == null || string.IsNullOrEmpty(bnModel.remiseWay) || string.IsNullOrEmpty(bnModel.pubUnit) || bnModel.expireDate == null || bnModel.transferDate == null || string.IsNullOrEmpty(bnModel.blockZone))
    {
        bnModel.gtState = 2;
    }
    else
    {
        bnModel.gtState = 1;
    }
}
/// <summary>
/// Extracts every land-block table from the announcement HTML and stores each one
/// as a blockInfo row. A block is graded gtState 1 when all required fields were
/// found and 2 otherwise.
/// </summary>
/// <param name="strWebData">Announcement section of the page HTML.</param>
/// <param name="bnModel">The owning announcement; its region, method and unit are copied to each block.</param>
/// <param name="isComp">
/// Set to the result of <c>Exists("gtState=2")</c>; the caller marks the note as
/// "blocks incomplete" (gtstate=3) when this is true.
/// </param>
private void AddblockInfo(string strWebData, LandDB.Model.blockNote bnModel, out bool isComp)
{
    strWebData = strWebData.Replace(" ", "");
    LandDB.BLL.blockInfo bIBll = new LandDB.BLL.blockInfo();

    // One <div style="FONT-SIZE: 12px"> per land block.
    Regex divRg = new Regex(@"<DIV[\s]*style=[""'\s]*FONT-SIZE:[\s]*12px[""'\s]*>.*?</div>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    Regex cellRg = new Regex("(<td).*?[>]?(</td>)", RegexOptions.IgnoreCase);

    foreach (Match match in divRg.Matches(strWebData))
    {
        string blockTable = match.Value;
        if (blockTable == "")
        {
            continue;
        }

        LandDB.Model.blockInfo bIModel = new LandDB.Model.blockInfo();
        bIModel.noteId = bnModel.noteId;
        bIModel.province = bnModel.province;
        bIModel.city = bnModel.city;
        bIModel.blockZone = bnModel.blockZone;
        bIModel.transferMode = bnModel.remiseWay;
        bIModel.dataType = 1;
        bIModel.blockState = 2;
        bIModel.state = 1;
        bIModel.createTime = DateTime.Now;
        bIModel.pubUnit = bnModel.pubUnit;
        bIModel.firstPrice = 0;
        bIModel.donePrice = 0;
        bIModel.doneArea = 0;
        bIModel.floorPrice = 0;
        bIModel.blockArea = "";

        // The table alternates label cells and value cells: when a known label is
        // found, the following cell holds its value.
        MatchCollection mc = cellRg.Matches(blockTable);
        for (int i = 0; i < mc.Count; i++)
        {
            string label = Regex.Replace(mc[i].Value, @"<[^>]+>", "");
            if (string.IsNullOrEmpty(label) || !IsBlockFieldLabel(label))
            {
                continue;
            }
            if (i + 1 >= mc.Count)
            {
                // A label in the last cell has no value cell; stop instead of
                // indexing past the end of the collection.
                break;
            }

            i++; // consume the value cell
            string tdvalue = Regex.Replace(mc[i].Value, @"<[^>]+>", "");

            if (label == "宗地编号:")
            {
                bIModel.blockNo = tdvalue;
            }
            else if (label == "宗地面积:" || label == "宗地总面积:")
            {
                bIModel.blockArea = tdvalue;
            }
            else if (label == "起始价:")
            {
                // Leading number only; an unparseable value keeps the default 0.
                Match first = Regex.Match(tdvalue, @"^(\-|\+)?[\d,]+(\.\d+)?");
                decimal price;
                if (first.Success && decimal.TryParse(first.Value, out price))
                {
                    bIModel.firstPrice = price;
                }
            }
            else if (label == "容积率:")
            {
                bIModel.blockRate = Regex.Match(tdvalue, @"^(\-|\+)?[\d,]+(\.\d+)?").Value;
                bIModel.blockRateStr = tdvalue;
            }
            else if (label == "土地用途:" || label == "土地用途明细:")
            {
                bIModel.oriUseType = tdvalue;
            }
            else if (label == "宗地坐落:")
            {
                bIModel.blockAddress = tdvalue;
            }
            else if (label.IndexOf("挂牌截止时间") != -1)
            {
                // An unparseable date is left unset so the block is graded
                // incomplete rather than aborting the whole page.
                DateTime expire;
                if (DateTime.TryParse(tdvalue, out expire))
                {
                    bIModel.expireDate = expire;
                }
            }
            else if (label.IndexOf("估价报告备案号") != -1)
            {
                bIModel.RecordNumberOfAppraisalReport = tdvalue;
            }
        }

        // Scraped text goes into the SQL filter: double any single quote first.
        bool isEx = bIBll.Exists(string.Format(
            "blockAddress='{0}' and blockNo='{1}' and state=1 ",
            (bIModel.blockAddress ?? "").Replace("'", "''"),
            (bIModel.blockNo ?? "").Replace("'", "''")));

        // 1 = collected, 2 = collection failed (a required field is missing).
        if (string.IsNullOrEmpty(bnModel.province) || string.IsNullOrEmpty(bnModel.city) || string.IsNullOrEmpty(bnModel.blockZone) || string.IsNullOrEmpty(bIModel.blockArea) || string.IsNullOrEmpty(bIModel.oriUseType) || bIModel.expireDate == null || string.IsNullOrEmpty(bIModel.pubUnit) || string.IsNullOrEmpty(bIModel.blockAddress) || string.IsNullOrEmpty(bIModel.blockNo) || bIModel.firstPrice == null || string.IsNullOrEmpty(bIModel.blockRateStr) || string.IsNullOrEmpty(bIModel.RecordNumberOfAppraisalReport))
        {
            bIModel.gtState = 2;
        }
        else
        {
            bIModel.gtState = 1;
        }

        if (isEx)
        {
            // NOTE(review): this replaces the freshly parsed model with the stored
            // row and then writes that same row back, so the new values never reach
            // the database. Copying the stored key onto the parsed model is probably
            // what was intended; confirm against the blockInfo schema.
            bIModel = bIBll.GetModel(bIModel.blockAddress, bIModel.blockNo);
            if (bIModel.gtState == 2)
            {
                bIBll.Update(bIModel);
            }
        }
        else
        {
            bIBll.Add(bIModel);
        }
    }

    // NOTE(review): this filter is not restricted to the current note, so it
    // appears to report an incomplete block anywhere in the table; confirm
    // whether a noteId condition is needed.
    isComp = bIBll.Exists("gtState=2");
}

/// <summary>
/// True when the cell text is one of the block-field labels AddblockInfo understands.
/// </summary>
private static bool IsBlockFieldLabel(string label)
{
    return label == "宗地编号:"
        || label == "宗地面积:" || label == "宗地总面积:"
        || label == "起始价:"
        || label == "容积率:"
        || label == "土地用途:" || label == "土地用途明细:"
        || label == "宗地坐落:"
        || label.IndexOf("挂牌截止时间") != -1
        || label.IndexOf("估价报告备案号") != -1;
}
//项目源码地址:http://www.onethink.top/1/SoudiWinForm.zip