怎么获取下载链接和构造 Request 请求不是本文的重点,本文的重点是获取下载链接后怎么提高下载速度,以及断点续传功能的实现思路。
获取下载链接后可以从 Response 中获得所需文档的大小,即 Content-Length 的值;而 Request 中的 Range 属性可以设置下载的字节范围。知道了这个原理,多线程下载和断点续传就不难实现了。笔者的一段实现多线程下载的代码如下:
/// <summary>
/// with parm from and to control the start and end download postion of the stream which can download the stream with multithread and even
/// implement the continious download after breaking.
/// </summary>
/// <param name="url"></param>
/// <param name="from"> the start download position of the stream</param>
/// <param name="to">The end download position of the stream</param>
/// <returns></returns>
public static HttpWebRequest getBlblHttpReq3(String url, long from, long to)
{
string headers = $@"GET /upgcxcode/49/91/252239149/252239149-1-416.mp4?e=ig8euxZM2rNcNbh17WdVhoMzhWUVhwdEto8g5X10ugNcXBlqNxHxNEVE5XREto8KqJZHUa6m5J0SqE85tZvEuENvNo8g2ENvNo8i8o859r1qXg8xNEVE5XREto8GuFGv2U7SuxI72X6fTr859r1qXg8gNEVE5XREto8z5JZC2X2gkX5L5F1eTX1jkXlsTXHeux_f2o859IB_&uipk=5&nbs=1&deadline=1608147118&gen=playurl&os=bcache&oi=1898897508&trid=e78bffb50fe34dbcbfa1a64f94ef5e5dp&platform=pc&upsig=2c86c8b3551699a6761bf640d27261d3&uparams=e,uipk,nbs,deadline,gen,os,oi,trid,platform&cdnid=3230&mid=598717778&orderid=0,3&agrr=0&logo=80000000 HTTP/1.1
Host: cn-bj3-cc-bcache-11.bilivideo.com
Connection: keep-alive
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36
Accept-Encoding: identity;q=1, *;q=0
Accept: */*
Sec-Fetch-Site: cross-site
Sec-Fetch-Mode: no-cors
Sec-Fetch-Dest: video
Referer: https://www.bilibili.com/
Accept-Language: zh-CN,zh;q=0.9
Range: bytes=0-
";
String pattern = "(GET (?<url>.*)\n)(Host:(?<host>.*))\n(Connection:(?<Connection>.*))\n(User-Agent: (?<User-Agent>.*))\n(Accept:(?<Accept>.*))\n(Origin:(?<Origin>.*))\n(Sec-Fetch-Site: (?<Sec-Fetch-Site>.*))\n(Sec-Fetch-Mode: (?<Sec-Fetch-Mode>.*))\n(Sec-Fetch-Dest: (?<Sec-Fetch-Dest>.*))\n(Referer: (?<Referer>.*))\n(Accept-Encoding: (?<Accept-Encoding>.*)\n(Accept-Language: (?<Accept-Language>.*))\n(Range: (?<Range>.*))\n(If-Range: (?<If-Range>.*))";
pattern = "(GET (?<url>.*)\n)(Host:(?<host>.*))\n(Connection:(?<Connection>.*))\n(User-Agent: (?<User-Agent>.*))\n(Accept:(?<Accept>.*))\n(Origin:(?<Origin>.*))\n(Sec-Fetch-Site: (?<Sec-Fetch-Site>.*))\n(Sec-Fetch-Mode: (?<Sec-Fetch-Mode>.*))\n(Sec-Fetch-Dest: (?<Sec-Fetch-Dest>.*))\n(Referer: (?<Referer>.*))\n(Accept-Encoding: (?<Accept-Encoding>.*)\n(Accept-Language: (?<Accept-Language>.*))\n(Range: (?<Range>.*))";
var web = new HtmlWeb();
//var urlPtn = @"([GET|OPTIONS]\s(?<url>.*logo=[0-9]*))";
var hostPtn = @"(Host:\s(?<host>.*)\r)";//for the headers parser
//get host from url
var hostptn2 = @"(https://(?<host>.+?))/";//for the param url parser
Match match = Regex.Match(url, hostPtn, RegexOptions.IgnoreCase);
var httpwebRequest = (HttpWebRequest)WebRequest.Create(url); // requests.get()
try
{
// The two key item , host can be obtained from the last information and Referer have to obtained from fiddler
match = Regex.Match(url, hostptn2, RegexOptions.IgnoreCase);
httpwebRequest.Host = match.Groups["host"].Value;
pattern = "(Range: (?<Range>.*))\r";
match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
httpwebRequest.AddRange(from, to);
pattern = "(Referer: (?<Referer>.*))\r";
match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
httpwebRequest.Referer = match.Groups["Referer"].Value;
pattern = "(Origin:(?<Origin>.*))\r";
match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
httpwebRequest.Headers["origin"] = match.Groups["Origin"].Value;
httpwebRequest.Method = "Get";
//httpwebRequest.ContentType = match.Groups["ContentType"].Value;
//httpwebRequest.ContentLength=
pattern = "(Accept:(?<Accept>.*)\r)";
match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
httpwebRequest.Accept = match.Groups["Accept"].Value; ;
pattern = "(Accept-Encoding:(?<AcceptEncoding>.*)\r)";
match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
httpwebRequest.Headers["Accept-Encoding"] = match.Groups["AcceptEncoding"].Value;
pattern = "(Accept-Language:(?<AcceptLanguage>.*)\r)";
match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
httpwebRequest.Headers["Accept-Language"] = match.Groups["AcceptLanguage"].Value;
pattern = "(User-Agent:(?<UserAgent>.*)\r)";
match = Regex.Match(headers, pattern, RegexOptions.IgnoreCase);
httpwebRequest.UserAgent = match.Groups["UserAgent"].Value;
httpwebRequest.Headers["Upgrade-Insecure-Requests"] = "1";
httpwebRequest.KeepAlive = true;
return httpwebRequest;
}
catch (Exception ex)
{
MessageBox.Show(ex.Message);
}
return null;
}
其中的 header 部分是通过使用 Fiddler 软件跟踪分析获得的,这是下载的关键。但是由于各个网站采用了不同的防盗链技术,这一部分需要针对具体网站自行分析。(当然,如果有开发浏览器的能力,这一步也就简单了)
上述函数的调用方法
// Example caller: split the download across one worker thread per mirror URL.
// Relies on outer scope for: validUrl (mirror URL list), totalLength
// (Content-Length of the whole file), fileName/fileType, fileLst, bVideo,
// httpRequest, and the DownLoadThread helper class (defined elsewhere).
index = 0;
threadTll = validUrl.Count;
// Bytes per thread; integer division, so the last thread must absorb the
// remainder (handled by the open-ended branch inside the loop).
partLen = totalLength / threadTll;
String threadFile = "";
List<DownLoadThread> dldThredLst = new List<DownLoadThread>();
foreach (string url in validUrl)
{
//range = String.Format("{0}-{1}", partLen * index, partLen * (index + 1));
// Each thread writes its slice to its own part file <name>_<index>.<ext>;
// the parts are merged after all threads finish.
threadFile = string.Format("{0}_{1}.{2}", fileName, index, fileType);
//create a new thread
if (index + 1 < threadTll)
{
// Closed byte range [partLen*index, partLen*(index+1) - 1] for every part
// except the last one.
httpRequest = myFileDownLoader.getBlblHttpReq3(url, partLen * index, partLen * (index + 1) - 1);
}
else
{
// Last part: request from partLen*index to end-of-file so the division
// remainder is not lost. NOTE(review): this calls a two-argument overload
// of getBlblHttpReq3 not shown here — presumably it issues "bytes=from-";
// confirm against its definition.
httpRequest = myFileDownLoader.getBlblHttpReq3(url, partLen * index);
}
DownLoadThread dldThread = new DownLoadThread(httpRequest, threadFile,index, bVideo);
dldThredLst.Add(dldThread);
// Each worker saves its HTTP response stream to its part file.
Thread httpRqstThread = new Thread(new ThreadStart(dldThread.saveStreamToFile));
//Thread httpRqstThread = new Thread(new ThreadStart(dldThread.saveStreamToFileTest));
//if (index == 3|| index==4)
httpRqstThread.Start();
//var streamReader = new StreamReader(httpwebResponse.GetResponseStream());
// Remember the part file so the merge step can stitch the pieces in order.
fileLst.Add(threadFile);
index++;
}
函数解析:
第一步获得所有链接,这一步的做法和具体的网站有关。笔者下载的是 B 站视频,该视频有多个下载地址,就可以利用多个下载地址对应不同的线程下载不同的部分,然后合成。同一个下载地址也可以使用这个技术,但是对于 B 站而言速度似乎不能加快(可能是服务端进行了限制)。根据线程数计算每个分块的起止位置,再分别启动各个线程,这样多线程下载就实现了。
断点续传的实现
计算获得的stream的大小,然后计算新的起点位置,将其保存到注册表或者文件中,这样下次下载时,就可以使用这个值作为新的下载起点值。当然还要考虑很多其他问题,如下载文件的保存等等。
等有时间了,做一个断点续传的例子。
maraSun 于 BJFWDQ
2022-02-27










