软讯网络 > 编程语言 > .NET > C#.NET > 抓取网页信息,并用正则表达式分析后得到信息。
【标 题】:抓取网页信息,并用正则表达式分析后得到信息。
【关键字】:
【来 源】:http://blog.csdn.net/yck263/archive/2006/12/20/1450095.aspx
本文演示如何抓取网页内容,并用正则表达式对其进行分析,从中提取所需信息(示例代码为 VB.NET)。
Imports System.Xml
Imports System.Text.RegularExpressions

''' <summary>
''' Demo form: downloads a city index page and uses a regular expression to
''' extract the first quoted five-digit city code found in it.
''' </summary>
Public Class Form1

    ''' <summary>
    ''' Click handler for Button1: fetches the city index page, runs the
    ''' city-code pattern against it and shows the first match in a message box.
    ''' </summary>
    ''' <param name="sender">The control that raised the event.</param>
    ''' <param name="e">Event data (unused).</param>
    Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click

        Try
            ' Scaffold for the XML output. The <citylist> element is created and
            ' stamped with today's date, but it is not yet attached to the document
            ' or populated with cities.
            Dim objXml As New XmlDocument
            objXml.LoadXml("<root />")

            Dim objXmlCityList As XmlElement = objXml.CreateElement("citylist")
            objXmlCityList.SetAttribute("vdatetime", DateTime.Now.ToShortDateString())

            ' Fetch the page that lists all cities.
            Dim pageContent As String = GetContent("http://localhost/CMA/index.htm", "gb2312")

            ' GetContent reports any failure as an empty string.
            If pageContent.Length = 0 Then
                MsgBox("Could not download the city list page.")
                Return
            End If

            ' Content downloaded; start analysing the data.
            ' The pattern matches a five-digit number wrapped in double quotes.
            ' Note that the quotes are part of the named group "citycode".
            Dim pattern As String = "(?<citycode>""[0-9]{5}"")"
            Dim cityCodeRegex As New System.Text.RegularExpressions.Regex(pattern, RegexOptions.Compiled Or RegexOptions.IgnoreCase)

            Dim mc As System.Text.RegularExpressions.Match = cityCodeRegex.Match(pageContent, 0)

            If mc.Success Then
                MsgBox(mc.Groups("citycode").Value)
            Else
                MsgBox("No city code was found in the downloaded page.")
            End If

            ' TODO: walk every match (mc = mc.NextMatch() while mc.Success) and
            ' append one child per city to objXmlCityList.

        Catch ex As Exception
            ' Report the problem instead of silently swallowing it.
            MsgBox(ex.Message)
        End Try

    End Sub

    ''' <summary>
    ''' Downloads <paramref name="url"/> and decodes the response body to a string.
    ''' </summary>
    ''' <param name="url">Address of the resource to download.</param>
    ''' <param name="encoding">
    ''' Name of the text encoding used to decode the response (for example
    ''' "utf-8" or "gb2312"). Empty or unrecognised names fall back to gb2312.
    ''' </param>
    ''' <returns>
    ''' The decoded response body, or an empty string if the download or the
    ''' decoding fails for any reason.
    ''' </returns>
    Private Function GetContent(ByVal url As String, ByVal encoding As String) As String
        Try
            ' WebClient is IDisposable; Using releases it even when the download throws.
            Using client As New System.Net.WebClient
                ' Send browser-like request headers.
                client.Headers.Add("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*")
                client.Headers.Add("Accept-Language", "zh-cn")
                client.Headers.Add("UA-CPU", "x86")
                client.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)")

                Dim buffer As Byte() = client.DownloadData(url)
                Return ResolveEncoding(encoding).GetString(buffer, 0, buffer.Length)
            End Using
        Catch ex As Exception
            ' Callers rely on "" meaning failure, so keep that contract, but leave
            ' a trace for debugging rather than discarding the error entirely.
            System.Diagnostics.Debug.WriteLine("GetContent failed for '" & url & "': " & ex.Message)
            Return ""
        End Try
    End Function

    ''' <summary>
    ''' Maps an encoding name to an Encoding instance, falling back to gb2312
    ''' when the name is empty or not recognised.
    ''' </summary>
    ''' <param name="name">Encoding name such as "utf-8" or "gb2312".</param>
    ''' <returns>The matching encoding, or the gb2312 encoding as a fallback.</returns>
    Private Shared Function ResolveEncoding(ByVal name As String) As System.Text.Encoding
        If Not String.IsNullOrEmpty(name) Then
            Try
                Return System.Text.Encoding.GetEncoding(name)
            Catch ex As ArgumentException
                ' Unknown encoding name: deliberately fall through to the default.
            End Try
        End If

        Return System.Text.Encoding.GetEncoding("gb2312")
    End Function

End Class

【相关文章】
没有相关文章