虫言虫语 虫言虫语

Python编写单线程爬虫

in Python read (19019) 文章转载请注明来源!
    #!/usr/bin/env python3
import re
import urllib.request
import os

def picurl(url, path, max_tries=3):
    """Crawl one page and save every image found on it into *path*.

    The target directory is created when it does not exist yet.  The page
    is first probed with ``loadurl``; if it cannot be loaded after
    *max_tries* attempts the crawl is abandoned instead of retrying
    forever.

    Args:
        url: Address of the page to scan for images.
        path: Directory the images are written to.
        max_tries: Maximum number of attempts to load the page.

    Returns:
        None.
    """
    if os.path.exists(path):
        print(path + 'exist')
    else:
        os.makedirs(path)

    # Bounded retry: the original looped endlessly (and without any delay)
    # while the page was unreachable.
    for _ in range(max_tries):
        if loadurl(url) != '':
            break
        print('load' + url + 'error')
    else:
        # Every attempt failed; there is nothing to extract.
        return

    # NOTE: re_pic() fetches the page again itself; the probe above only
    # confirms that the page is reachable.
    pic_list(re_pic(url), path)

def save_pic(url,path):
    """Download the image at *url* into the directory *path*.

    The file name is the last ``.jpg``/``.gif`` path segment of the URL.
    An already existing file is left untouched.  Otherwise the download is
    attempted up to three times.

    Args:
        url: Address of the image.
        path: Existing directory the image is stored in.

    Returns:
        True when the file already exists or was downloaded, False when no
        file name could be derived from *url* or every attempt failed.
    """
    # Group both extensions inside one alternative; the original pattern
    # '.*/(.*?.jpg)|(.*?.gif)' left group 1 empty for .gif URLs.
    match = re.search(r'/([^/]*?\.(?:jpg|gif))', url)
    if match is None:
        print(url + ': Failed to download')
        return False

    filename = path + "/" + match.group(1)
    print(filename + ':start')

    # Check once, before any attempt, so that a failed attempt can never be
    # mistaken for a previously completed download.
    if os.path.exists(filename):
        print(filename + 'exists,skip')
        return True

    for _ in range(3):
        if download(url, filename):
            print(filename + ': Over')
            return True

    # The file did not exist before the attempts, so anything present now
    # is an incomplete leftover that would make a later run skip this image.
    if os.path.exists(filename):
        try:
            os.remove(filename)
        except OSError:
            # Best-effort cleanup; the failure is still reported below.
            pass
    print(url + ': Failed to download')
    return False

def download(url,filename):
    """Fetch *url* and write the response body to *filename*.

    Args:
        url: Address of the resource to fetch.
        filename: Path of the file to create or overwrite.

    Returns:
        True when the body was fetched and written, False on any failure.
    """
    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
        # The context manager closes the connection even when read() fails.
        with urllib.request.urlopen(req,timeout=5) as conn:
            data = conn.read()
        # Open the target only after the body is complete, so a failed
        # transfer does not leave an empty file behind.
        with open(filename,'wb') as f:
            f.write(data)
        return True
    except urllib.error.URLError:
        print('load' + url + 'error')
        return False
    except Exception:
        print('unkown exception in conn.read()')
        # Always report failure as False (the original returned '' here).
        return False
def pic_list(picList,path):
    """Save every URL that occurs in the strings of *picList*.

    The strings are joined with spaces and scanned for ``scheme://...``
    tokens; each token is passed to ``save_pic``.

    Args:
        picList: Iterable of strings that may contain URLs.
        path: Directory passed through to ``save_pic``.
    """
    joined = ' '.join(picList)
    # Raw string: '\w' and '\s' are invalid escapes in a plain literal.
    # The loop variable no longer shadows the module-level picurl().
    for image_url in re.findall(r'(\w+://[^\s]+)', joined):
        save_pic(image_url, path)

def re_pic(url):
    """Extract the ``<img>`` data found on the page at *url*.

    The page is loaded with ``loadurl`` and scanned for ``<img alt=... src=...
    [title=...] />`` tags.  For each tag the alt text, the ``src`` URL and
    the title text are collected.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of unique strings (alt texts, image URLs and title texts,
        possibly including the empty string) in order of first appearance.
    """
    # Same pattern as before, written as a raw string so that '\s' and '\w'
    # are not invalid string escapes.
    searchname = r'<img alt="([^"]*)"( class="scrollLoading")?(  style="width:100%;")?\s*src="(\w+://[^\s]+)"( title="([^"]*)")? />'
    collected = []
    # Select the wanted groups by position.  The original dropped any group
    # whose text contained 'class', 'style' or 'title', which also discarded
    # genuine image URLs or alt texts containing those words.
    for alt, _cls, _style, src, _title_attr, title in re.findall(searchname, loadurl(url)):
        collected.extend((alt, src, title))
    # dict keys keep insertion order: order-preserving dedupe in one pass.
    return list(dict.fromkeys(collected))

def loadurl(url, encoding='GBK', timeout=5):
    """Fetch *url* and return its body decoded as text.

    Args:
        url: Address of the page to fetch.
        encoding: Character encoding used to decode the body.
        timeout: Seconds to wait on a blocking network operation; matches
            the limit used by ``download`` so a stalled server cannot hang
            the crawl.

    Returns:
        The decoded page text, or '' when the page could not be fetched or
        decoded.
    """
    try:
        req = urllib.request.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
        # The context manager closes the connection even when read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as conn:
            raw = conn.read()
        return raw.decode(encoding)
    except urllib.error.URLError:
        return ''
    except Exception:
        print('unkown exception in conn.read()')
        return ''

if __name__ == '__main__':
    # Script entry point: crawl a single gallery page and store its images
    # in a fixed local directory.
    start_page = 'http://www.meizitu.com/a/454.html'
    picurl(start_page, 'D:/WW')
jrotty WeChat Pay

微信打赏

jrotty Alipay

支付宝打赏

文章二维码

扫描二维码,在手机上阅读!

发表新评论
已有 6283 条评论
  1. econcorenny
    econcorenny
    回复

    medterra cbd pure cbd oil cdb oils cbd oil for sale

  2. crapyragacoup

    free casino slots http://nodepositcasinolpw.com/ - free casino games no download house of fun slots new online casinos online casino slots no download

  3. Amitsflormror

    http://nodepositcasinolpw.com/ three rivers casino http://nodepositcasinolpw.com/ - free casino games list of las vegas casinos

  4. swimarumTauff

    free penny slots with bonus spins free full casino games download rock n cash casino slots slots for real money free no deposit http://nodepositcasinolpw.com/ - casino bonus codes

  5. smagmevagype
    smagmevagype
    回复

    play free vegas casino games vegas slots free vegas casino online pop slots casino http://nodepositcasinolpw.com/ - hollywood online casino

  6. econcorenny
    econcorenny
    回复

    http://onlinecbdoilfda.com/ cbd oil for sale cbd products hemp oil for pain cbd gummies

  7. dediata
    dediata
    回复

    Parties and excessive noise are strictly prohibited, so if you experience a problem you should immediately contact the hosts. hearthealthheroes.com Ammonius, by some chance you happen to be the one who provided the opening and approach for what was said on that occasion.

  8. innodield
    innodield
    回复

    http://nodepositcasinolpw.com/ free casino for fun only http://nodepositcasinolpw.com/ - casino near me slots online free

  9. econcorenny
    econcorenny
    回复

    http://hempcbdoilgs.com/ cbd gummies walmart buy cbd oil online cbd for sale cbd

  10. Amitsflormror

    http://nodepositcasinolpw.com/ online casino no deposit free welcome bonus http://nodepositcasinolpw.com/ - free vegas world slots big fish casino

  11. swimarumTauff

    http://nodepositcasinolpw.com/ best place to gamble in vegas vegas world online slot machines download free casino games

  12. SeapokeSmele
    SeapokeSmele
    回复

    cbd store cbd capsules cbd gummies hemp cbd oil

  13. smagmevagype
    smagmevagype
    回复

    casino slot machine games free casino games for fun free online bingo vegas world vegas slots casino http://nodepositcasinolpw.com/ - online gambling

  14. econcorenny
    econcorenny
    回复

    hemp cbd oil http://mynewcbdoil.com/ - cbd gummies buy cbd oil cbd for dogs cbd oils

  15. RafSeinaclecoca

    http://hempcbdoilgs.com/ cbd cream http://buycbdoilsm.com/ - cbd products cbd gummies

  16. econcorenny
    econcorenny
    回复

    http://cbdoilglk.com/ cbd vape cbd drops cbd gummies cbd oils

  17. crapyragacoup

    casino games slots free http://nodepositcasinolpw.com/ casinos online show all free slots games play slots

  18. econcorenny
    econcorenny
    回复

    hemp oil cbd store cbd pills cbd gummies walmart http://onlinecbdoilfda.com/ - cbd gummies walmart

  19. innodield
    innodield
    回复

    las vegas free slots http://nodepositcasinolpw.com/ bonus casino high five casino slots free casino slots no download

  20. traicy
    traicy
    回复

    auto

博客已萌萌哒运行
© 2020 由 Typecho 强力驱动.Theme by Yodu
前篇 后篇
雷姆
拉姆