Python编写单线程爬虫
#!/usr/bin/env python3
import re
import urllib.request
import os
def picurl(url, path, max_retries=3):
    """Download every picture referenced by the page at *url* into *path*.

    The target directory is created when it does not exist yet.  The page
    is probed with ``loadurl`` first; ``loadurl`` returns an empty string
    when the page cannot be loaded, in which case the probe is repeated.

    Args:
        url: Address of the HTML page to scan for pictures.
        path: Directory the pictures are saved into.
        max_retries: Maximum number of attempts to load the page.  The
            retry count is bounded so that an unreachable page cannot
            keep the program spinning forever.

    Returns:
        None.  Nothing is downloaded when every load attempt fails.
    """
    if os.path.exists(path):
        print(path + 'exist')
    else:
        os.makedirs(path, exist_ok=True)

    for _ in range(max_retries):
        if loadurl(url) != '':
            break
        print('load' + url + 'error')
    else:
        # Every attempt failed: give up instead of retrying indefinitely.
        return

    # re_pic() fetches the page again on its own; the probe above only
    # confirms that the page is reachable before scraping starts.
    pic_list(re_pic(url), path)
def save_pic(url, path):
    """Download the picture at *url* into the directory *path*.

    The local file name is the last ``.jpg``/``.gif`` path segment of the
    URL.  A file that already exists is left untouched.  Otherwise the
    download is attempted up to three times.

    Args:
        url: Address of a ``.jpg`` or ``.gif`` picture.
        path: Existing directory the picture is written into.

    Returns:
        True when the file already exists or was downloaded successfully,
        False when the URL does not name a ``.jpg``/``.gif`` file or every
        download attempt failed.
    """
    # The extension alternation is grouped so that both .jpg and .gif URLs
    # yield the bare file name in group 1, and the dot is escaped so it
    # only matches a literal '.'.
    match = re.match(r'.*/([^/]*?\.(?:jpg|gif))', url)
    if match is None:
        print(url + ': Failed to download')
        return False

    filename = os.path.join(path, match.group(1))
    print(filename + ':start')

    # Check once, before any attempt: checking inside the retry loop would
    # mistake a file left behind by a failed attempt for a finished one.
    if os.path.exists(filename):
        print(filename + 'exists,skip')
        return True

    print(filename)
    for _ in range(3):
        if download(url, filename):
            print(filename + ': Over')
            return True
        # Drop whatever a failed attempt may have written so that a later
        # run does not skip the picture as "already downloaded".
        if os.path.exists(filename):
            os.remove(filename)

    print(url + ': Failed to download')
    return False
def download(url, filename):
    """Fetch *url* and write the response body to *filename*.

    The request is sent with a desktop Firefox ``User-Agent`` header and a
    five second timeout.

    Args:
        url: Address of the resource to fetch.
        filename: Path of the file the response body is written to.

    Returns:
        True when the body was fetched and written, False on any failure
        (malformed URL, network error, timeout, or file write error).
    """
    try:
        # Building the request can itself raise (e.g. ValueError for a
        # malformed URL), so it lives inside the try block as well.
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
        # Read the whole body before opening the target file, so that a
        # failed transfer does not leave an empty file behind; the
        # ``with`` also closes the connection.
        with urllib.request.urlopen(req, timeout=5) as conn:
            data = conn.read()
        with open(filename, 'wb') as f:
            f.write(data)
        return True
    except urllib.error.URLError:
        print('load' + url + 'error')
        return False
    except Exception:
        # Best-effort downloader: report and signal failure to the caller
        # (which retries) instead of aborting the whole crawl.
        print('unkown exception in conn.read()')
        return False
def pic_list(picList, path):
    """Save every URL found in *picList* into the directory *path*.

    The strings are joined with spaces and every ``scheme://...`` token
    (running up to the next whitespace) is handed to ``save_pic``; strings
    that contain no URL are ignored.

    Args:
        picList: Iterable of strings, some of which contain picture URLs.
        path: Directory passed through to ``save_pic``.

    Returns:
        None.
    """
    joined = ' '.join(picList)
    # Raw string: '\w' and '\S' are regex escapes, not string escapes.
    for pic_url in re.findall(r'\w+://\S+', joined):
        save_pic(pic_url, path)
def re_pic(url):
    """Collect the alt text, source URL and title of the page's pictures.

    The page is fetched with ``loadurl`` and scanned for ``<img ... />``
    tags of the form
    ``<img alt="..." [class="scrollLoading"] [style="width:100%;"]
    src="scheme://..." [title="..."] />``.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of the non-empty ``alt``, ``src`` and ``title`` values of
        all matching tags, in order of first appearance and without
        duplicates.  The list is empty when the page could not be loaded
        or contains no matching tag.
    """
    # The optional class/style/title attributes are matched with
    # non-capturing groups, so only the three wanted values are captured.
    # Filtering captured text by the substrings 'class'/'style'/'title'
    # would also discard a legitimate URL or caption containing them.
    img_tag = re.compile(
        r'<img alt="([^"]*)"(?: class="scrollLoading")?'
        r'(?: style="width:100%;")?\s*src="(\w+://[^\s]+)"'
        r'(?: title="([^"]*)")? />'
    )
    fields = []
    for alt, src, title in img_tag.findall(loadurl(url)):
        fields.extend(text for text in (alt, src, title) if text)
    # dict.fromkeys keeps first-seen order while removing duplicates.
    return list(dict.fromkeys(fields))
def loadurl(url, timeout=10, encoding='GBK'):
    """Fetch *url* and return its body decoded as text.

    The request is sent with a desktop Firefox ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the caller forever.
        encoding: Character encoding used to decode the response body.

    Returns:
        The decoded page text, or an empty string when the page could not
        be fetched or decoded.
    """
    try:
        # Building the request can raise ValueError for a malformed URL,
        # so it is kept inside the try block.
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
        # The ``with`` closes the connection once the body has been read.
        with urllib.request.urlopen(req, timeout=timeout) as conn:
            raw = conn.read()
        return raw.decode(encoding)
    except urllib.error.URLError:
        return ''
    except Exception:
        # Callers treat '' as "could not load"; report anything unexpected
        # (timeout, decode error, malformed URL) and signal it the same way.
        print('unkown exception in conn.read()')
        return ''
if __name__ == '__main__':
    # Script entry point: download the pictures of the hard-coded page
    # below into the hard-coded target directory.
    url = 'http://www.meizitu.com/a/454.html'
    picurl(url=url, path='D:/WW')

微信打赏
支付宝打赏
扫描二维码,在手机上阅读!
赏