虫言虫语 虫言虫语

Python编写单线程爬虫

in Python read (8268) 文章转载请注明来源!
#!/usr/bin/env python3
import re
import urllib.request
import os

def picurl(url, path, max_tries=3):
    """Download the pictures referenced by the page at *url* into *path*.

    The target directory is created when it does not exist yet. The page is
    first fetched with ``loadurl`` purely as a reachability check; once a
    fetch returns a non-empty document, ``re_pic`` extracts the candidate
    strings from the page and ``pic_list`` saves the pictures.

    Args:
        url: Address of the page to scan.
        path: Directory the pictures are written to.
        max_tries: Number of attempts at loading the page before giving up.
            Previously the retry was unbounded and spun forever on a page
            that could not be loaded.

    Returns:
        None. When the page cannot be loaded within *max_tries* attempts,
        nothing is downloaded.
    """
    # EAFP: creating first and handling "already there" avoids the race
    # between a separate existence check and the creation.
    try:
        os.makedirs(path)
    except FileExistsError:
        print(path + 'exist')

    for _ in range(max_tries):
        if loadurl(url) != '':
            break
        print('load' + url + 'error')
    else:
        # Every attempt failed (or max_tries <= 0): nothing to crawl.
        return

    pic_list(re_pic(url), path)

def save_pic(url, path):
    """Save the ``.jpg``/``.gif`` picture at *url* into directory *path*.

    The local file name is the last path component of *url* up to and
    including its ``.jpg`` or ``.gif`` extension. A file that already exists
    is left untouched. Otherwise the download is attempted up to three times.

    Args:
        url: Address of the picture.
        path: Existing directory to store the picture in.

    Returns:
        True when the file already exists or was downloaded, False when the
        URL does not name a jpg/gif picture or every attempt failed.
    """
    # The extension alternation is grouped and the dot escaped so that both
    # .jpg and .gif URLs yield the bare file name in group 1.
    match = re.search(r'.*/(.*?\.(?:jpg|gif))', url)
    if match is None:
        print(url + ': Failed to download')
        return False

    filename = path + "/" + match.group(1)
    print(filename + ':start')

    # Check once, before any attempt: the file must not be pre-created here,
    # otherwise a failed attempt would look like an existing download.
    if os.path.exists(filename):
        print(filename + 'exists,skip')
        return True

    for _ in range(3):
        if download(url, filename):
            print(filename + ': Over')
            return True

    # The file did not exist before, so anything left behind is an
    # incomplete download; remove it so a later run does not skip it.
    if os.path.exists(filename):
        try:
            os.remove(filename)
        except OSError:
            # Best-effort cleanup only; the failure is reported below.
            pass
    print(url + ': Failed to download')
    return False

def download(url, filename):
    """Fetch *url* and write the response body to *filename*.

    The request is sent with a desktop Firefox User-Agent and a 5 second
    timeout. The body is read completely before the target file is opened,
    so a failed transfer does not leave an empty file behind.

    Args:
        url: Address of the resource to fetch.
        filename: Path of the file to (over)write with the response body.

    Returns:
        True when the file was written, False on any fetch or write failure.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    try:
        # The context manager closes the connection even when read() fails.
        with urllib.request.urlopen(req, timeout=5) as conn:
            data = conn.read()
    except urllib.error.URLError:
        print('load' + url + 'error')
        return False
    except Exception:
        print('unkown exception in conn.read()')
        return False

    try:
        with open(filename, 'wb') as f:
            f.write(data)
    except (OSError, ValueError):
        print('save' + filename + 'error')
        return False
    return True
def pic_list(picList, path):
    """Save every URL found in the strings of *picList* into *path*.

    The strings are joined with spaces and every ``scheme://...`` run of
    non-whitespace characters is handed to ``save_pic``; strings without a
    URL are ignored.

    Args:
        picList: Iterable of strings that may contain picture URLs.
        path: Directory passed on to ``save_pic``.
    """
    joined = ' '.join(picList)
    # Raw string: '\w' and '\s' are invalid escapes in a normal literal.
    # The loop variable no longer shadows the module-level picurl() function.
    for pic_url in re.findall(r'(\w+://[^\s]+)', joined):
        save_pic(pic_url, path)

def re_pic(url):
    """Collect the ``<img>`` fields of the page at *url*.

    The page is fetched with ``loadurl`` and scanned for ``<img alt=... src=...
    [title=...] />`` tags. For every tag the alt text, the src URL and the
    title text are collected; the captured optional attribute fragments
    (``class``, ``style`` and the whole ``title="..."`` attribute) are
    discarded.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of the collected strings without duplicates, in order of first
        appearance. Optional parts that did not match contribute an empty
        string.
    """
    searchname = r'<img alt="([^"]*)"( class="scrollLoading")?(  style="width:100%;")?\s*src="(\w+://[^\s]+)"( title="([^"]*)")? />'
    # Positions of the groups that capture a whole optional attribute.
    # Filtering by position instead of by substring keeps alt texts and URLs
    # that merely contain the words "class", "style" or "title".
    attribute_groups = (1, 2, 4)

    fields = []
    for groups in re.findall(searchname, loadurl(url)):
        for index, value in enumerate(groups):
            # An unmatched optional group is '' and is kept, as before.
            if index in attribute_groups and value:
                continue
            fields.append(value)

    # dict preserves insertion order, giving an order-stable de-duplication.
    return list(dict.fromkeys(fields))

def loadurl(url):
    """Fetch the page at *url* and return it decoded as GBK text.

    The request is sent with a desktop Firefox User-Agent and a 5 second
    timeout, the same timeout ``download`` uses, so an unresponsive server
    cannot block the caller indefinitely.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page, or '' when the page cannot be fetched or decoded.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    try:
        # The context manager closes the connection even when read() fails.
        with urllib.request.urlopen(req, timeout=5) as conn:
            raw = conn.read()
        return raw.decode('GBK')
    except urllib.error.URLError:
        return ''
    except Exception:
        print('unkown exception in conn.read()')
        return ''

if __name__ == '__main__':
    # Script entry: crawl one gallery page and store its pictures locally.
    start_page = 'http://www.meizitu.com/a/454.html'
    target_dir = 'D:/WW'
    picurl(start_page, target_dir)
jrotty WeChat Pay

微信打赏

jrotty Alipay

支付宝打赏

文章二维码

扫描二维码,在手机上阅读!

发表新评论
已有 3904 条评论
  1. crapyragacoup

    slots games online slots vegas casino slots casino game http://onlinecasinosvkk.com/ - online casino games

  2. innodield
    innodield
    回复

    vegas casino slots play slots online casino slots

  3. swimarumTauff

    best online casinos http://onlinecasinosvkk.com/ slots free online casino free casino games

  4. econcorenny
    econcorenny
    回复

    cbd capsules http://buycbdoilsm.com/ - buy cbd oil cbd oil store cbd oil hemp cbd oil

  5. rxjcdred
    rxjcdred
    回复

    federal funding of viagra cost of viagra at walmart female viagra pills viagra considerations viagra active ingredientlomaira viagra interaction he cheated stepsis by giving her viagra in the drink viagra online girl in viagra commercial blue dress baby bonus mayor french viagrahttps://pharm-usa-official.com - viagra dosagehttps://hainanlife.info/hainan_today/news_hainan/rafting_cycling_and_running_in_wuzhishan_451712/?strIMessage=Comment+submitted+for+moderation
    http://www.radioveronicaone.it/2018/12/12/anastasio-e-il-vincitore-di-x-factor-2018/?unapproved=344069&moderation-hash=24caab0341e81f6b78d0ff0ab38ae40a#comment-344069
    https://onincome.net/2019/07/29/7-ways-to-make-money-fast-if-youre-in-a-hurry/#comment-20540
    https://thelilyhoneylife.com/life-lately/?unapproved=278884&moderation-hash=3829a371412e474652f97f345b3e8adc#comment-278884
    http://senclic.com/politique/presidentielle-cellou-dalein-diallo-soutient-macky-sall-conde/?unapproved=1923558&moderation-hash=bf033e53175a58ae87419fe85eed23c9#comment-1923558

  6. econcorenny
    econcorenny
    回复

    hemp cbd cdb oils cbd store medterra cbd

  7. smagmevagype
    smagmevagype
    回复

    slot games casino online real money casino play slots online http://onlinecasinosvkk.com/ - free slots

  8. RafSeinaclecoca

    cbd oil benefits http://mynewcbdoil.com/ - buy cbd oil online cbd tinctures cbd pure

  9. econcorenny
    econcorenny
    回复

    cbd near me cbd oil benefits cbd oil for dogs

  10. SeapokeSmele
    SeapokeSmele
    回复

    cbd store http://cbdoilwshop.com/ - cbd for dogs cbd pure cbd near me

  11. Amitsflormror

    online slots http://onlinecasinosvkk.com/ slots games free play online casino play casino

  12. econcorenny
    econcorenny
    回复

    cannabis oil best cbd oil buy cbd oil cbd online cbd oil online http://cbdoilyeu.com/ - cbd oil store

  13. econcorenny
    econcorenny
    回复

    cbd hemp cannabis oil cbd for dogs hemp cbd http://mynewcbdoil.com/ - cbd for dogs

  14. crapyragacoup

    http://onlinecasinosvkk.com/ free slots vegas casino slots vegas slots online slot games

  15. smagmevagype
    smagmevagype
    回复

    online casino slots casino slots play online casino online casino gambling

  16. RafSeinaclecoca

    buy hemp oil hemp cbd oil medterra cbd

  17. econcorenny
    econcorenny
    回复

    buy hemp oil http://cbdoilglk.com/ cbd for dogs cbd pure cbd tinctures

  18. SeapokeSmele
    SeapokeSmele
    回复

    cbd oil for dogs http://cbdoilwshop.com/ - cbd vape cannabis oil cbd products cbd pills

  19. Amitsflormror

    free casino http://freecasinosmq.com/ - play slots online casino online slots online casino bonus

  20. econcorenny
    econcorenny
    回复

    http://cbdoilusds.com/ hemp cbd oil http://cbdoilusds.com/ - cbd capsules cbd

博客已萌萌哒运行
© 2020 由 Typecho 强力驱动.Theme by Yodu
前篇 后篇
雷姆
拉姆