虫言虫语 虫言虫语

Python编写单线程爬虫

in Python · read (19030) 文章转载请注明来源!
    #!/usr/bin/env python3
import os
import re
import urllib.error
import urllib.request

def picurl(url, path, max_tries=5):
    """Download every picture referenced by the page at *url* into *path*.

    The target directory is created when it does not exist yet.  The page
    is probed with ``loadurl`` first; once one probe succeeds the picture
    list is extracted with ``re_pic`` and handed to ``pic_list``.

    Args:
        url: Address of the gallery page.
        path: Directory the pictures are saved into.
        max_tries: How many times the page is probed before giving up.
            The original code retried forever, which spins in a tight
            loop when the page is permanently unreachable.  A value
            below 1 skips the download entirely.

    Returns:
        None.
    """
    if os.path.exists(path):
        print(path + 'exist')
    else:
        os.makedirs(path)

    for _ in range(max_tries):
        # loadurl() signals failure by returning an empty string.
        if loadurl(url) != '':
            break
        print('load' + url + 'error')
    else:
        # Every probe failed (or none was allowed): nothing to download.
        return

    pic_list(re_pic(url), path)

def save_pic(url, path):
    """Save the ``.jpg``/``.gif`` picture at *url* into directory *path*.

    The file name is the last path segment of *url* (any query string or
    fragment is ignored).  An already existing file is left untouched.
    Otherwise the download is attempted up to three times.

    Args:
        url: Address of the picture.
        path: Existing directory to store the picture in.

    Returns:
        True when the file already exists or was downloaded, False when
        *url* does not name a jpg/gif file or every attempt failed.
    """
    # Anchor on the final path segment.  The previous pattern
    # '.*/(.*?.jpg)|(.*?.gif)' put the alternation at top level, so gif
    # URLs produced an empty name and were never downloaded.
    match = re.search(r'([^/?#]+\.(?:jpg|gif))(?:[?#].*)?$', url)
    if match is None:
        print(url + ': Failed to download')
        return False

    filename = path + "/" + match.group(1)
    print(filename + ':start')

    # Checked once, before any attempt: no placeholder file is created any
    # more, so a failed attempt can no longer look like "already there".
    if os.path.exists(filename):
        print(filename + 'exists,skip')
        return True
    print(filename)

    for _ in range(3):
        if download(url, filename):
            print(filename + ': Over')
            return True
        # The file did not exist before this call, so anything present
        # now is a leftover of the failed attempt; remove it so a later
        # run does not skip the picture.
        try:
            os.remove(filename)
        except OSError:
            pass

    print(url + ': Failed to download')
    return False

def download(url, filename):
    """Fetch *url* and write the response body to *filename*.

    The body is read completely before the output file is opened, so a
    failed request never leaves an empty file behind.

    Args:
        url: Address to fetch.
        filename: Path of the file to (over)write with the response body.

    Returns:
        True on success, False on any failure (the failure is printed).
    """
    try:
        # Building the request is inside the try block as well, so a
        # malformed URL is reported as a failure instead of raising.
        req = urllib.request.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
        # The context manager closes the connection even when read() fails.
        with urllib.request.urlopen(req, timeout=5) as conn:
            data = conn.read()
        with open(filename, 'wb') as f:
            f.write(data)
        return True
    except urllib.error.URLError:
        print('load' + url + 'error')
        return False
    except Exception:
        print('unkown exception in conn.read()')
        # Was '' before; both are falsy, False matches the other branch.
        return False
def pic_list(picList, path):
    """Call ``save_pic`` for every URL found in *picList*.

    The entries are joined with spaces and every ``scheme://...`` token
    (a run of non-whitespace characters) is treated as a picture URL;
    entries that contain no such token are ignored.

    Args:
        picList: Iterable of strings, some of which contain URLs.
        path: Directory passed through to ``save_pic``.

    Returns:
        None.
    """
    joined = ' '.join(picList)
    # Raw string: '\w' and '\s' are invalid escapes in a normal literal.
    # The loop variable is no longer called ``picurl``, which shadowed the
    # module-level function of the same name.
    for pic_address in re.findall(r'(\w+://[^\s]+)', joined):
        save_pic(pic_address, path)

def re_pic(url):
    """Collect the ``<img>`` attribute values found on the page at *url*.

    The page is fetched with ``loadurl`` and scanned for ``<img alt=...
    src=... [title=...] />`` tags.  For each tag the alt text, the src URL
    and the title text are collected.

    The values are now picked by capture group.  Previously every captured
    group was filtered by whether it contained the words 'class', 'style'
    or 'title', which also threw away genuine alt texts and image URLs
    that happened to contain one of those words.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of the non-empty alt/src/title strings, without duplicates,
        in order of first appearance.  Empty strings are no longer
        included.
    """
    # Only alt, src and title text are captured; the optional class/style
    # attributes and the title wrapper are non-capturing.
    pattern = (
        r'<img alt="([^"]*)"(?: class="scrollLoading")?'
        r'(?:  style="width:100%;")?\s*src="(\w+://[^\s]+)"'
        r'(?: title="([^"]*)")? />'
    )
    collected = []
    for alt, src, title in re.findall(pattern, loadurl(url)):
        collected.extend(value for value in (alt, src, title) if value)
    # dict keys keep insertion order, giving an order-preserving dedupe.
    return list(dict.fromkeys(collected))

def loadurl(url, timeout=5):
    """Fetch *url* and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection before giving up.
            The request previously had no timeout and could block
            forever; the default matches the one ``download`` uses.

    Returns:
        The decoded page, or '' when the request or the decoding fails.
    """
    try:
        # Building the request is inside the try block as well, so a
        # malformed URL yields '' instead of raising.
        req = urllib.request.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
        # The context manager closes the connection even when read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as conn:
            raw = conn.read()
        return raw.decode('GBK')
    except urllib.error.URLError:
        return ''
    except Exception:
        print('unkown exception in conn.read()')
        return ''

if __name__ == '__main__':
    # Script entry point: crawl one fixed gallery page and store its
    # pictures in a fixed local directory.
    picurl('http://www.meizitu.com/a/454.html', 'D:/WW')
jrotty WeChat Pay

微信打赏

jrotty Alipay

支付宝打赏

文章二维码

扫描二维码,在手机上阅读!

发表新评论
已有 6284 条评论
  1. smagmevagype

    cbd hemp cbd near me buy cbd oil online cbd oil http://cbd-online.us.com/ - cbd gummies walmart

  2. smagmevagype

    cbd buy hemp oil cbd for dogs cdb oils http://cbd-online.us.com/ - cbd online

  3. swimarumTauff

    cbd online cbd near me cbd oil store medterra cbd

  4. crapyragacoup

    cbd oil at walmart buy cbd oil online cbd for dogs cbd pure

  5. swimarumTauff

    cbd pills http://cbd-oils.us.com/ - cdb oils cbd hemp cbd store

  6. smagmevagype

    cannabis oil http://cbd-online.us.com/ - cbd capsules hemp oil cbd vape

  7. Amitsflormror

    cbd hemp cbd store cbd best cbd oil

  8. crapyragacoup

    hemp cbd oil cbd oil for pain cbd oil cbd oil benefits http://cbdoilstore.us.com/ - cbd store

  9. swimarumTauff

    http://cbd-oils.us.com/ buy cbd http://cbd-oils.us.com/ - hemp oil for pain hemp cbd oil

  10. Amitsflormror

    http://cbd-hemp.us.com/ cbd oil for dogs cbd gummies walmart pure cbd oil cbd oil for dogs

  11. dbniLiene
    dbniLiene
    回复

    generic cialis https://valcialisns.com/ - buy cialis cialis cialis online cialis online

  12. lqbtbCrarl
    lqbtbCrarl
    回复

    cialis price http://genericcialls.com/ - cialis pills cheap cialis cialis daily cialis online

  13. Amitsflormror

    http://cbd-hemp.us.com/ cbd for sale http://cbd-hemp.us.com/ - cbd pills cbd capsules

  14. mejzCrarl
    mejzCrarl
    回复

    generic cialis http://pharmcilisa.com/ - generic cialis cialis cialis daily cheap cialis

  15. smagmevagype

    cbd vape http://cbd-online.us.com/ buy cbd oil online cbd oil online cbd for dogs

  16. smagmevagype

    cbd pure http://cbd-online.us.com/ cannabis oil hemp oil cbd near me

  17. crapyragacoup

    http://cbdoilstore.us.com/ cbd oils cbd cream cbd hemp cbd store

  18. swimarumTauff

    http://cbd-oils.us.com/ cbd oil for dogs http://cbd-oils.us.com/ - cbd hemp cbd products

  19. crapyragacoup

    cbd cbd for sale cbd vape cbd pills http://cbdoilstore.us.com/ - cbd oil store

  20. swimarumTauff

    cbd medic http://cbd-oils.us.com/ - best cbd oil buy cbd oil buy cbd cbd pure cbd drops

博客已萌萌哒运行
© 2020 由 Typecho 强力驱动.Theme by Yodu
前篇 后篇
雷姆
拉姆