虫言虫语 虫言虫语

Python编写单线程爬虫

in Python · read (45912) · 文章转载请注明来源!
#!/usr/bin/env python3
import re
import urllib.request
import os

def picurl(url, path, max_retries=3):
    """Download the pictures referenced by the page at *url* into *path*.

    The target directory is created when missing.  The page is first probed
    with ``loadurl``; once it loads, the fields extracted by ``re_pic`` are
    handed to ``pic_list`` for saving.

    Args:
        url: Address of the page to scan for pictures.
        path: Directory the pictures are saved into.
        max_retries: Maximum number of attempts to load the page before
            giving up.  Previously the page was retried forever in a tight
            loop, which never terminated for an unreachable URL.
    """
    if os.path.exists(path):
        print(path + 'exist')
    else:
        os.makedirs(path)

    for _ in range(max_retries):
        # loadurl signals failure with an empty string.
        if loadurl(url) != '':
            break
        print('load' + url + 'error')
    else:
        # Every attempt failed: nothing to parse, so stop here.
        return

    pic_list(re_pic(url), path)

def save_pic(url, path):
    """Download the picture at *url* into the directory *path*.

    The local file name is the last segment of the URL's path component, so
    any picture extension is accepted (the former regex only coped with
    ``.jpg`` and silently mis-handled ``.gif``).  An already existing file is
    left untouched.  The download is attempted up to three times.

    Args:
        url: Address of the picture.
        path: Existing directory to store the picture in.

    Returns:
        True if the file already existed or was downloaded, False if no file
        name could be derived from *url* or every attempt failed.
    """
    from urllib.parse import urlsplit

    # Use only the path component so a query string never ends up in the name.
    name = urlsplit(url).path.rsplit('/', 1)[-1]
    if not name:
        print(url + ': Failed to download')
        return False

    filename = path + "/" + name
    print(filename + ':start')
    if os.path.exists(filename):
        print(filename + 'exists,skip')
        return True

    for _ in range(3):
        if download(url, filename):
            print(filename + ': Over')
            return True
        # A failed attempt may leave a partial file behind; remove it so it
        # is not mistaken for a finished download later on.
        try:
            os.remove(filename)
        except OSError:
            pass  # Nothing was written, or it cannot be removed: best effort.

    print(url + ': Failed to download')
    return False

def download(url, filename):
    """Fetch *url* and write the response body to *filename*.

    Args:
        url: Address of the resource to fetch.
        filename: Path of the file the body is written to (binary mode).

    Returns:
        True when the body was fetched and written, False on any failure.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    try:
        # Read the whole body before opening the target, so a failed
        # transfer does not leave an empty file behind; the ``with`` block
        # also closes the connection.
        with urllib.request.urlopen(req, timeout=5) as conn:
            data = conn.read()
        with open(filename, 'wb') as f:
            f.write(data)
        return True
    except urllib.error.URLError:
        print('load' + url + 'error')
        return False
    except Exception:
        # Also reached for timeouts while reading and for errors opening or
        # writing the target file.
        print('unkown exception in conn.read()')
        return False
def pic_list(picList, path):
    """Save every URL found in *picList* into the directory *path*.

    The strings in *picList* are joined with spaces and every
    ``scheme://...`` token (up to the next whitespace) is passed to
    ``save_pic``; strings that contain no such token are ignored.

    Args:
        picList: Iterable of strings, some of which contain URLs.
        path: Directory handed on to ``save_pic``.
    """
    joined = ' '.join(picList)
    # Raw string: ``\w`` and ``\s`` are regex escapes, not string escapes.
    for pic_url in re.findall(r'(\w+://[^\s]+)', joined):
        save_pic(pic_url, path)

def re_pic(url):
    """Extract the ``<img>`` fields of the page at *url*.

    The page is fetched with ``loadurl`` and scanned for ``<img alt=... src=...
    />`` tags.  All captured groups of every match are collected, except
    those containing ``class``, ``style`` or ``title``.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of the distinct captured strings in order of first
        appearance.  Besides the ``src`` URLs this includes the ``alt`` and
        title texts, and an empty string when an optional group did not match.
    """
    # Raw string so ``\s`` and ``\w`` reach the regex engine unchanged.
    pattern = r'<img alt="([^"]*)"( class="scrollLoading")?(  style="width:100%;")?\s*src="(\w+://[^\s]+)"( title="([^"]*)")? />'
    skip_markers = ('class', 'style', 'title')

    fields = []
    for groups in re.findall(pattern, loadurl(url)):
        fields.extend(
            field for field in groups
            if not any(marker in field for marker in skip_markers)
        )
    # dict keys keep insertion order: a linear-time, order-preserving dedup.
    return list(dict.fromkeys(fields))

def loadurl(url, timeout=5):
    """Fetch *url* and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up.  The
            request previously had no timeout and could block indefinitely.

    Returns:
        The decoded page text, or ``''`` when the page could not be fetched
        or decoded.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    try:
        # The ``with`` block closes the connection once the body is read.
        with urllib.request.urlopen(req, timeout=timeout) as conn:
            raw = conn.read()
        return raw.decode('GBK')
    except urllib.error.URLError:
        return ''
    except Exception:
        # Also reached when the body is not valid GBK.
        print('unkown exception in conn.read()')
        return ''

if __name__ == '__main__':
    # Script entry point: scrape one fixed gallery page into a fixed folder.
    url, target_dir = 'http://www.meizitu.com/a/454.html', 'D:/WW'
    picurl(url, target_dir)
jrotty WeChat Pay

微信打赏

jrotty Alipay

支付宝打赏

文章二维码

扫描二维码,在手机上阅读!

发表新评论
已有 11851 条评论
  1. cialis coupon

    Cazaqt apxfxs where can i buy clomid purchase clomid

  2. canadian online pharmacy cialis

    Cmajfu dmyfml lasix generic name lasix 20 mg

  3. online cialis

    Kyzqej yobdks amoxicillin pneumonia amoxicillin sales worldwide

  4. cialis prices

    Iftoqb shxokv lasix uses lasix 40mg

  5. buying cialis online safely

    Pwsvnk uetvau azithromycin 250 azithromycin z pack

  6. cialis tadalafil

    Htltaj irwkxh Intagra buy Avana

  7. buying viagra online

    Gpyxpc fbetch buy clomid where can i buy clomiphene

  8. LerDroryWepe
    LerDroryWepe
    回复

    canadian parmacy viagra 100mg

  9. purchasing cialis online

    Kmtewb ritjfh where can i buy clomiphene buy clomid online

  10. JaneDes
    JaneDes
    回复

    baclofen online canada

  11. Kozychobete
    Kozychobete
    回复

    buy viagra paraguayhttp://viagrayqdd.com/ - buy viagraviagra online

  12. Kozychobete
    Kozychobete
    回复

    russia buy viagrahttp://viagrayqdd.com/ - viagra online pharmacybuy viagra

  13. Zhannaobete
    Zhannaobete
    回复

    cialis with discounthttp://cialisyytr.com/ - cialisbuy cialis onlineuso cialis genericohttp://ciaviagogogo.com/ - cheap cialisonline cialischeap generic cialis canadahttp://cilapharm.com/ - buy cialis onlinecialis onlineaffordable custom letter writinghttps://edpillsnews20.com/ - generic viagraviagra cheap

  14. Zhannaobete
    Zhannaobete
    回复

    buy generic female cialishttp://cialisyytr.com/ - generic cialis onlinebuy generic cialisbest online store for cialishttp://ciaviagogogo.com/ - cheap cialischeap cialisclick now cialiscomhttp://cilapharm.com/ - cheap cialis onlineonline cialisprofessional report writing serviceshttps://edpillsnews20.com/ - viagra onlineviagra generic

  15. BeardenSwanobete

    bulk cialishttps://ciasale20.com/ - cialis
    eu buy cialisachat en ligne cialis 5mg

  16. cialis cialis

    Maaioo qzrrmw generic clomiphene where can i buy clomiphene

  17. BeardenSwanobete

    fda approved cialis generichttps://ciasale20.com/ - cialis curativo
    only now cialis softcialis super pill

  18. BeardenSwanobete

    we choice cialis to orderhttps://ciasale20.com/ - ou acheter du cialis moins che
    cialisi use it cialis sale

  19. BeardenSwanobete

    ou acheter du cialis moins chehttps://ciasale20.com/ - cheap cialis online
    cialis online pharmacy canadacomprar cialis generico 5mg

  20. CallicuttVidhyaobete

    viagra fai da tehttps://deliveryviagranow.com/ - viagra generic
    viagra online sicherer verkauflook here indian viagra

博客已萌萌哒运行
© 2020 由 Typecho 强力驱动.Theme by Yodu
前篇 后篇
雷姆
拉姆