虫言虫语 虫言虫语

Python编写单线程爬虫

in Python · read (19015) · 文章转载请注明来源!
#!/usr/bin/env python3
import re
import urllib.request
import os

def picurl(url, path, max_tries=3):
    """Crawl the gallery page at ``url`` and save its images into ``path``.

    The target directory is created when it does not exist yet.  The page
    is probed with ``loadurl`` first; when every probe fails the crawl is
    abandoned instead of retrying forever.

    Args:
        url: Address of the gallery page to crawl.
        path: Directory the images are stored in.
        max_tries: How many times the page is probed before giving up
            (must be at least 1 for anything to be downloaded).

    Returns:
        None.
    """
    if os.path.exists(path):
        print(path + 'exist')
    else:
        # exist_ok guards against the directory appearing between the
        # existence check above and this call.
        os.makedirs(path, exist_ok=True)

    for _ in range(max_tries):
        if loadurl(url) != '':
            break
        print('load' + url + 'error')
    else:
        # Every probe failed: stop here rather than hammering the server
        # in an endless busy loop.
        return

    # re_pic fetches the page itself, so the probe result is not reused.
    pic_list(re_pic(url), path)

def save_pic(url, path):
    """Download the image at ``url`` into the directory ``path``.

    The file name is the last URL path segment that ends in ``.jpg`` or
    ``.gif``.  An already existing file is left untouched.  Otherwise the
    download is attempted up to three times.

    Args:
        url: Address of the image.
        path: Existing directory the image is stored in.

    Returns:
        True when the file already existed or was downloaded, False when
        no file name could be derived from ``url`` or all attempts failed.
    """
    # Both extensions share one group; the original alternation left the
    # name empty for .gif URLs and raised IndexError on anything else.
    match = re.search(r'.*/([^/]*?\.(?:jpg|gif))', url, re.IGNORECASE)
    if match is None:
        print(url + ': Failed to download')
        return False

    filename = path + "/" + match.group(1)
    print(filename + ':start')

    if os.path.exists(filename):
        print(filename + 'exists,skip')
        return True

    for _ in range(3):
        if download(url, filename):
            print(filename + ': Over')
            return True
        # A failed attempt may leave a partial file behind; remove it so a
        # later run does not mistake it for a finished download.
        if os.path.exists(filename):
            os.remove(filename)

    print(url + ': Failed to download')
    return False

def download(url, filename):
    """Fetch ``url`` and write the response body to ``filename``.

    Args:
        url: Address of the resource to fetch.
        filename: Path of the file the body is written to.

    Returns:
        True when the body was fetched and written, False on any failure.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    try:
        # The response is closed as soon as the body has been read.
        with urllib.request.urlopen(req, timeout=5) as conn:
            data = conn.read()
        # The file is only created once the whole body is available, so a
        # failed read does not leave an empty file behind.
        with open(filename, 'wb') as f:
            f.write(data)
        return True
    except urllib.error.URLError:
        print('load' + url + 'error')
        return False
    except Exception:
        print('unkown exception in conn.read()')
        return False
def pic_list(picList, path):
    """Save every URL found in the strings of ``picList`` into ``path``.

    Args:
        picList: Iterable of strings; anything in them that looks like
            ``scheme://...`` up to the next whitespace is treated as an
            image URL.
        path: Directory handed on to ``save_pic``.

    Returns:
        None.
    """
    joined = ' '.join(picList)
    # Raw string: "\w" and "\S" are regex classes, not string escapes.
    for image_url in re.findall(r'\w+://\S+', joined):
        save_pic(image_url, path)

def re_pic(url):
    """Collect the text captured from the ``<img>`` tags of the page at ``url``.

    For every matching tag the ``alt`` text, the ``src`` URL and the
    ``title`` text (empty when the tag has no title) are collected.

    Args:
        url: Address of the page to scan; it is fetched with ``loadurl``.

    Returns:
        List of unique strings in order of first appearance.
    """
    pattern = (
        r'<img alt="([^"]*)"( class="scrollLoading")?(  style="width:100%;")?'
        r'\s*src="(\w+://[^\s]+)"( title="([^"]*)")? />'
    )
    collected = []
    for groups in re.findall(pattern, loadurl(url)):
        # Groups 1, 2 and 4 only capture optional attribute markup.  They
        # are skipped by position rather than by searching every value for
        # "class"/"style"/"title", which also threw away real image URLs
        # that merely contain one of those words.
        collected.extend((groups[0], groups[3], groups[5]))
    # dict keys keep insertion order, giving an order-preserving dedupe.
    return list(dict.fromkeys(collected))

def loadurl(url, encoding='GBK', timeout=5):
    """Fetch ``url`` and return the response body decoded as text.

    Args:
        url: Address of the page to fetch.
        encoding: Character encoding used to decode the body.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot block the crawl indefinitely.

    Returns:
        The decoded page, or an empty string when fetching or decoding
        fails.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    try:
        # The response is closed as soon as the body has been read.
        with urllib.request.urlopen(req, timeout=timeout) as conn:
            raw = conn.read()
        return raw.decode(encoding)
    except urllib.error.URLError:
        return ''
    except Exception:
        print('unkown exception in conn.read()')
        return ''

if __name__ == '__main__':
    # Script entry point: crawl one fixed gallery page into a local folder.
    picurl('http://www.meizitu.com/a/454.html', 'D:/WW')
jrotty WeChat Pay

微信打赏

jrotty Alipay

支付宝打赏

文章二维码

扫描二维码,在手机上阅读!

发表新评论
已有 6283 条评论
  1. gwbqabaxy
    gwbqabaxy
    回复

    viagra https://zvigariageneric.com/ - generic viagra viagra without a doctor prescription generic viagra viagra

  2. dbniLiene
    dbniLiene
    回复

    cialis https://valcialisns.com/ - cialis cialis cialis buy cialis

  3. mejzCrarl
    mejzCrarl
    回复

    cialis online http://pharmcilisa.com/ - generic cialis cialis online cialis buy cialis

  4. ncpnwbleme
    ncpnwbleme
    回复

    testosterone function best testosterone boosters low testosterone testosterone patches for men que es testosteronasymptoms of low testosterone symptoms of low testosterone testosterone injections free testosterone how to increase testosteronehttps://pharm-usa-official.com/testosterone.php - Sustanon 350

  5. gwbqabaxy
    gwbqabaxy
    回复

    viagra https://zvigariageneric.com/ - viagra without a doctor prescription cheap viagra generic viagra generic viagra

  6. smagmevagype
    smagmevagype
    回复

    cbd cbd pills cbd oil for dogs cbd online

  7. crapyragacoup

    http://cbdoilstore.us.com/ cbd near me http://cbdoilstore.us.com/ - cbd oil for dogs cbd for dogs

  8. smagmevagype
    smagmevagype
    回复

    cbd pills http://cbd-online.us.com/ - best cbd oil buy cbd oil cbd capsules buy hemp oil best cbd oil buy cbd oil

  9. swimarumTauff

    cbd drops http://cbd-oils.us.com/ - cbd oil for pain cbd oil cbd medic cbd online

  10. crapyragacoup

    cbd products http://cbdoilstore.us.com/ - cbd hemp cbd oil cbd oil for sale

  11. swimarumTauff

    cbd tinctures http://cbd-oils.us.com/ - hemp oil for pain hemp oil cbd medic

  12. smagmevagype

    cbd capsules http://cbd-online.us.com/ - cbd oil for pain best cbd oil cbd oil for dogs

  13. vsbhabaxy
    vsbhabaxy
    回复

    viagra https://genericjojos.com/ - viagra online viagra without doctor prescription cheap viagra viagra

  14. crapyragacoup

    cbd oil cbd for sale cannabis oil cbd oil for dogs http://cbdoilstore.us.com/ - cbd hemp

  15. swimarumTauff

    http://cbd-oils.us.com/ best cbd oil buy cbd oil cbd vape hemp cbd hemp cbd oil

  16. gwbqabaxy
    gwbqabaxy
    回复

    viagra https://zvigariageneric.com/ - generic viagra viagra viagra viagra without doctor prescription

  17. Amitsflormror

    cbd oil benefits http://cbd-hemp.us.com/ - cbd hemp hemp oil for pain cbd for dogs

  18. grlsetelf
    grlsetelf
    回复

    viagra online https://medyvirgaraonline.com/ - cheap viagra viagra generic viagra generic viagra

  19. gwbqabaxy
    gwbqabaxy
    回复

    viagra https://zvigariageneric.com/ - viagra without doctor prescription viagra without a doctor prescription viagra online viagra

  20. Amitsflormror

    cbd vape cbd cream cbd store

博客已萌萌哒运行
© 2020 由 Typecho 强力驱动.Theme by Yodu
前篇 后篇
雷姆
拉姆