"""Scrape videos from xiaohuar.com.

This file holds two variants of the same crawler, kept in their original
order:

1. a sequential crawler, and
2. a concurrent crawler that runs every stage on a thread pool and chains
   the stages together with future callbacks.

Both variants define functions with the same names, so the second section
rebinds ``get_page``, ``parse_index``, ``parse_detail``, ``save`` and
``main``. When the file is run as a script the sequential crawl runs first
and the concurrent crawl second.

NOTE(review): the text this file was restored from had been collapsed onto
two lines, and the middle of ``parse_index`` through the start of
``parse_detail`` was lost in both variants. Only the pattern prefix
``class="items.*?`` and the tail ``... 0: movie_url = res[0]`` survived.
The rest of the two regular expressions, the page encoding, the relative-URL
handling and the way the concurrent ``parse_index`` hands work to
``parse_detail`` are reconstructions -- confirm them against the original
source or the live site. Any other statement that lived in the lost region
(for example a progress message) has not been restored.
"""
import hashlib
import os
import queue  # not used by the code visible in this file; import kept as-is
import re
import time
from concurrent.futures import ThreadPoolExecutor
from threading import current_thread

import requests

SITE_ROOT = 'http://www.xiaohuar.com'
MOVIE_DIR = './movies'
REQUEST_TIMEOUT = 10        # seconds; without it a stalled server blocks forever
DOWNLOAD_CHUNK = 64 * 1024  # bytes written per iteration while streaming

# NOTE(review): reconstructed, see the module docstring -- TODO confirm.
PAGE_ENCODING = 'gbk'
DETAIL_LINK_RE = re.compile(r'class="items.*?<a href="(.*?)"', re.S)
MOVIE_SRC_RE = re.compile(r'id="media".*?src="(.*?)"', re.S)


# ---------------------------------------------------------------------------
# Helpers shared by both variants
# ---------------------------------------------------------------------------

def _extract_detail_urls(page):
    """Return the absolute detail-page URLs found in an index page.

    ``page`` is the raw body returned by ``get_page``. ``None`` or an empty
    body (a failed fetch) yields an empty list instead of raising.
    """
    if not page:
        return []
    text = page.decode(PAGE_ENCODING, errors='replace')
    urls = []
    for url in DETAIL_LINK_RE.findall(text):
        # NOTE(review): presumably some links are site-relative -- confirm.
        if not url.startswith('http'):
            url = SITE_ROOT + url
        urls.append(url)
    return urls


def _extract_movie_url(page):
    """Return the first video URL found in a detail page, or ``None``.

    ``None`` is returned both for a failed fetch (``page`` is ``None`` or
    empty) and for a page on which the pattern does not match.
    """
    if not page:
        return None
    match = MOVIE_SRC_RE.search(page.decode(PAGE_ENCODING, errors='replace'))
    return match.group(1) if match else None


def _download_movie(movie_url):
    """Download ``movie_url`` into ``MOVIE_DIR`` and return the file path.

    The file name is the MD5 hex digest of the URL plus the current time,
    so repeated downloads of the same URL do not overwrite each other.
    Returns ``None`` when the server does not answer with status 200.
    Network and file errors propagate to the caller.
    """
    # Stream the body so a large video is never held in memory in one piece.
    with requests.get(movie_url, stream=True,
                      timeout=REQUEST_TIMEOUT) as response:
        if response.status_code != 200:
            return None
        digest = hashlib.md5(
            ('%s%s.mp4' % (movie_url, time.time())).encode('utf-8')
        ).hexdigest()
        os.makedirs(MOVIE_DIR, exist_ok=True)
        path = '%s/%s.mp4' % (MOVIE_DIR, digest)
        # Write under a temporary name first so an interrupted download
        # never leaves a truncated file that looks like a finished .mp4.
        partial_path = path + '.part'
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK):
                f.write(chunk)
        os.replace(partial_path, path)
    return path


# ---------------------------------------------------------------------------
# Sequential crawl
# ---------------------------------------------------------------------------

def get_page(url):
    """Fetch ``url`` and return the response body as bytes.

    Returns ``None`` when the request fails or the status is not 200. The
    error is printed rather than silently discarded.
    """
    print('GET %s' % url)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except Exception as e:  # best-effort crawl: report and carry on
        print(e)
        return None
    if response.status_code == 200:
        return response.content
    return None


def parse_index(res):
    """Return the detail-page URLs listed on the index page body ``res``."""
    return _extract_detail_urls(res)


def parse_detail(res):
    """Return the video URL found in the detail page body ``res``, or None."""
    return _extract_movie_url(res)


def save(movie_url):
    """Download ``movie_url`` into ``./movies``.

    Failures are printed instead of raised so that one bad download does
    not abort the rest of the crawl (same policy as the concurrent variant).
    """
    try:
        _download_movie(movie_url)
    except Exception as e:
        print(e)


def main():
    """Crawl the first five index pages one request at a time."""
    index_url = 'http://www.xiaohuar.com/list-3-{0}.html'
    for i in range(5):
        print('*' * 50, i)
        # Fetch the index page.
        index_page = get_page(index_url.format(i))
        # Parse it to get the list of video-page URLs.
        detail_urls = parse_index(index_page)
        # Crawl each video page in turn.
        for detail_url in detail_urls:
            # Fetch the video page.
            detail_page = get_page(detail_url)
            # Extract the video URL.
            movie_url = parse_detail(detail_page)
            if movie_url:
                # Save the video.
                save(movie_url)


if __name__ == '__main__':
    main()


# ---------------------------------------------------------------------------
# Concurrent crawl
#
# The definitions below rebind the names used by the sequential crawl above.
# ---------------------------------------------------------------------------

p = ThreadPoolExecutor(50)


def get_page(url):
    """Fetch ``url`` on the calling thread and return the body as bytes.

    Returns ``None`` when the request fails or the status is not 200.
    """
    print('%s GET %s' % (current_thread().name, url))
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except Exception as e:  # best-effort crawl: report and carry on
        print(e)
        return None
    if response.status_code == 200:
        return response.content
    return None


def parse_index(res):
    """Done-callback for an index-page fetch.

    ``res`` is the future returned by ``p.submit(get_page, ...)``. Each
    detail URL found in its result is fetched on the pool.

    NOTE(review): chaining ``parse_detail`` as the callback of those
    fetches is reconstructed -- confirm.
    """
    print('%s parse index ' % current_thread().name)
    page = res.result()
    for detail_url in _extract_detail_urls(page):
        p.submit(get_page, detail_url).add_done_callback(parse_detail)


def parse_detail(res):
    """Done-callback for a detail-page fetch.

    ``res`` is the future for that fetch. When a video URL is found it is
    appended to ``db.txt`` and a download is submitted to the pool.
    """
    movie_url = _extract_movie_url(res.result())
    if not movie_url:
        return
    print('MOVIE_URL: ', movie_url)
    with open('db.txt', 'a', encoding='utf-8') as f:
        f.write('%s\n' % movie_url)
    # Download on the pool rather than calling save() inline, so this
    # callback returns quickly.
    p.submit(save, movie_url)
    print('%s下载任务已经提交' % movie_url)


def save(movie_url):
    """Download ``movie_url`` into ``./movies``; print errors, never raise."""
    print('%s SAVE: %s' % (current_thread().name, movie_url))
    try:
        _download_movie(movie_url)
    except Exception as e:
        print(e)


def main():
    """Submit the first five index pages to the pool.

    Parsing and downloading happen in the callbacks chained from here.
    """
    index_url = 'http://www.xiaohuar.com/list-3-{0}.html'
    for i in range(5):
        p.submit(get_page, index_url.format(i)).add_done_callback(parse_index)


if __name__ == '__main__':
    main()