用正则表达式从网页里面提取视频地址

作者: ScottGu
发布时间:2015-07-01 14:33:31

//用正则表达式从网页里面提取视频地址

//先获得一个视频页面的地址,取得该页面的 HTML 源码,然后用正则表达式从中匹配出视频文件地址

//详细说明请看代码中的注释。


 1///<summary>
2 /// 网页视频处理类
3 ///</summary>
6 ///<history>
7 ///
8 ///</history>
9 public class WebVideo
10 {
/// <summary>
/// Address of the video page (Youku, Ku6, Tudou and similar sites).
/// </summary>
private string _pageUrl;

/// <summary>
/// Whether the page should be fetched with compression enabled.
/// </summary>
private bool _isCompressed;

/// <summary>
/// The video site the page belongs to.
/// </summary>
private VideoSite _site;
25
26
/// <summary>
/// Creates an instance without assigning any field; the page address,
/// compression flag and site keep their default values.
/// </summary>
public WebVideo()
{
    // TODO: complete member initialization.
}
31
32
/// <summary>
/// Creates a <c>WebVideo</c> for the given video page.
/// </summary>
/// <param name="pageUrl">Address of the video page; surrounding whitespace is trimmed.</param>
/// <param name="isCompressed">Whether to request the page with compression enabled.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="pageUrl"/> is null.</exception>
public WebVideo(string pageUrl, bool isCompressed)
{
    // Fail with a descriptive exception instead of a NullReferenceException from Trim().
    if (pageUrl == null)
    {
        throw new System.ArgumentNullException("pageUrl");
    }

    this._pageUrl = pageUrl.Trim();
    this._isCompressed = isCompressed;
    // The site is derived from the trimmed address.
    this._site = this.GetSite(this._pageUrl);
}
45
46
/// <summary>
/// Downloads the page at <paramref name="Url"/> and returns its HTML source.
/// The request advertises gzip/deflate support and the body is decompressed
/// according to the response's Content-Encoding header.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>
/// The decoded page text, or an empty string when the request or decoding fails
/// (the error message is written to the console).
/// </returns>
public string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        request.Accept = "*/*";
        request.Headers.Set("Pragma", "no-cache");
        // Timeout in milliseconds.
        request.Timeout = 9000;
        request.UserAgent = "TaoCaiSpider1.0 Kevin-Gu's spider";
        request.Headers.Add("Accept-Encoding", "gzip,deflate");

        // Dispose the response so the underlying connection is released
        // even when decoding throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // Culture-invariant lowering: under e.g. a Turkish culture
            // "GZIP".ToLower() would not equal "gzip".
            string compressMode = (response.ContentEncoding ?? string.Empty).ToLowerInvariant();
            Console.WriteLine(compressMode);

            Stream decompressedStream;
            if (compressMode == "gzip")
            {
                decompressedStream = new GZipStream(response.GetResponseStream(), CompressionMode.Decompress);
            }
            else if (compressMode == "deflate")
            {
                decompressedStream = new DeflateStream(response.GetResponseStream(), CompressionMode.Decompress);
            }
            else
            {
                // No (or an unrecognised) content encoding: read the body as-is.
                decompressedStream = response.GetResponseStream();
            }

            Encoding encode = ResolveResponseEncoding(response.CharacterSet);
            using (StreamReader streamReader = new StreamReader(decompressedStream, encode))
            {
                strResult = streamReader.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the failure and fall through to return "".
        Console.WriteLine("error occored:" + ex.Message);
    }
    return strResult;
}

/// <summary>
/// Maps a response charset name to an <see cref="Encoding"/>, falling back to
/// UTF-8 when the name is missing or not recognised, so that a bad charset
/// header does not discard an otherwise successful download.
/// </summary>
private static Encoding ResolveResponseEncoding(string charset)
{
    if (charset != null)
    {
        // Some servers quote the charset value.
        charset = charset.Trim(' ', '"', '\'');
    }

    if (string.IsNullOrEmpty(charset))
    {
        return Encoding.UTF8;
    }

    try
    {
        return Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        return Encoding.UTF8;
    }
}
101
134
135
167
168
/// <summary>
/// Extracts the video file (player .swf) address from a page's HTML using
/// the regular expression that belongs to the current site.
/// </summary>
/// <param name="htmlContent">HTML source of the video page; may be null or empty.</param>
/// <returns>
/// The first matching address, or an empty string when the input is null or
/// empty, the site is not one of the handled values, or nothing matches.
/// </returns>
public string GetVideoFileUrl(string htmlContent)
{
    // Nothing to search; also avoids an ArgumentNullException from Regex.
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    // Pick only the pattern that is needed for the current site.
    string pattern;
    switch (_site)
    {
        case VideoSite.YouKu:
            pattern = @"http://player\.youku\.com/player\.php/sid/[\w]{13}/v\.swf";
            break;
        case VideoSite.Ku6:
            pattern = @"http://player\.ku6\.com/refer/[\w]{16}/v\.swf";
            break;
        case VideoSite.TuDou:
            pattern = @"http://js\.tudouui\.com/bin/player_online/[\w]+\.swf";
            break;
        default:
            return string.Empty;
    }

    // Run the expression once instead of IsMatch followed by Match.
    Match match = Regex.Match(htmlContent, pattern);
    return match.Success ? match.Value : string.Empty;
}
226
227
/// <summary>
/// Fetches the configured video page and returns the video file address found in it.
/// </summary>
/// <returns>The result of <c>GetVideoFileUrl</c> applied to the downloaded HTML.</returns>
public string GetVideoUrl()
{
    // Choose the download routine according to the compression flag.
    string pageHtml = _isCompressed
        ? this.GetWebContent(_pageUrl)
        : this.GetHtmlWithoutCompress(_pageUrl);

    return this.GetVideoFileUrl(pageHtml);
}
247
248
258
259 }//end class
260
261
/// <summary>
/// Video sites handled by <c>WebVideo</c>.
/// </summary>
public enum VideoSite
{
    YouKu = 0,
    Ku6 = 1,
    TuDou = 2
}
  
  
  

标签: 正则表达式
来源:http://www.cnblogs.com/scottgu/archive/2011/10/31/2230707.html

推荐: