抓取网页中的链接 - 中国WEB开发者网络 (http://www.webasp.net) -- 技术教程 (http://www.webasp.net/article/) --- 抓取网页中的链接 (http://www.webasp.net/article/27/26993.htm) |
| -- 作者:未知 -- 发布日期: 2005-10-27 |
输入一个地址,就可以把那个网页中的链接提取出来,下面这段代码可以轻松实现,主要的是用到了正则表达式。 GetUrl.aspx代码如下: 后代码GetUrl.aspx.vb如下: 从 " & urlTextBox.Text & "分离出的Href链接" & _ "找到并整理" & countOfMatches.ToString() & " 个链接" & _ report.ToString().Replace(Environment.NewLine, " ") TipResult.Text &= " 整理过的页面" resultLabel.Text = newWebPage End Sub

    ' Match evaluator invoked once per regex match found in the downloaded page.
    ' Appends a line to the report with the matched text and its row/column
    ' position inside webPage, bumps countOfMatches, and returns the replacement
    ' text: the matched text with a single leading "/" removed from the link
    ' captured by the "foundAnchor" group.
    '
    ' m       - the current regex match; must expose a "foundAnchor" group.
    ' Returns - the text that replaces the match in the rewritten page.
    Public Function MatchHandler(ByVal m As Match) As String
        Dim link As String = m.Groups("foundAnchor").Value

        ' A right-to-left, multiline "^" finds line starts at or before the
        ' match position: the nearest one yields the column offset, and the
        ' number of them yields the row number.
        Dim rToL As New Regex("^", RegexOptions.Multiline Or RegexOptions.RightToLeft)
        Dim col, row As Int32
        Dim lineBegin As Int32 = rToL.Match(webPage, m.Index).Index
        row = rToL.Matches(webPage, m.Index).Count
        col = m.Index - lineBegin

        report.AppendFormat( _
            "Link {0}, fixed at row: {1}, col: {2}{3}", _
            Server.HtmlEncode(m.Groups(0).Value), _
            row, _
            col, _
            Environment.NewLine _
        )

        countOfMatches += 1

        ' Only rewrite when there is a leading slash to strip. Previously the
        ' code always called Replace(link, newLink); with an unchanged link that
        ' is a no-op, and with an empty link String.Replace throws
        ' ArgumentException, so the matched text is now returned untouched.
        If link.StartsWith("/") Then
            Return m.Groups(0).Value.Replace(link, link.Substring(1))
        End If

        Return m.Groups(0).Value
    End Function

    ' Downloads the resource at the address typed into urlTextBox and returns
    ' its content as a string, decoded with System.Text.Encoding.Default.
    ' NOTE(review): Encoding.Default is the machine's ANSI code page; pages
    ' served in another encoding will presumably decode incorrectly - confirm
    ' whether the response charset should be honoured instead.
    '
    ' Returns - the full text of the downloaded resource.
    ' Exceptions raised by WebClient.OpenRead or the reader propagate to the
    ' caller; the stream and the WebClient are released in every case.
    Private Function GrabUrl() As String
        Dim wc As New WebClient()
        Try
            Dim s As Stream = wc.OpenRead(urlTextBox.Text)
            Try
                Dim sr As StreamReader = New StreamReader(s, System.Text.Encoding.Default)
                Return sr.ReadToEnd()
            Finally
                ' Closing the underlying stream also releases what the reader holds.
                s.Close()
            End Try
        Finally
            wc.Dispose()
        End Try
    End Function
End Class |
| webasp.net |