虫言虫语 虫言虫语

Python编写单线程爬虫

in Python read (1435) 文章转载请注明来源!
    #!/usr/bin/env python3
import re
import urllib.request
import os

def picurl(url, path, max_retries=3):
    """Download every image referenced by the page at ``url`` into ``path``.

    The target directory is created when it does not exist yet. The page is
    probed with ``loadurl`` first; ``loadurl`` returns an empty string on
    failure, in which case the probe is repeated.

    Args:
        url: Address of the HTML page to scan for images.
        path: Directory the images are saved into.
        max_retries: How many times the page load is attempted before giving
            up. The previous implementation retried forever, which hung the
            script on a permanently unreachable page.

    Returns:
        None. Nothing is downloaded when the page could not be loaded.
    """
    if os.path.exists(path):
        print(path + 'exist')
    else:
        os.makedirs(path)

    for _ in range(max_retries):
        if loadurl(url) != '':
            break
        print('load' + url + 'error')
    else:
        # Every attempt failed (or no attempt was allowed): stop instead of
        # spinning forever.
        return

    # re_pic() fetches the page itself, so only the URL is handed over.
    pic_list(re_pic(url), path)

def save_pic(url, path):
    """Download the image at ``url`` into the directory ``path``.

    The local file name is the part of ``url`` after the last ``/`` that
    precedes the name, up to and including its ``.jpg`` or ``.gif``
    extension. An already existing file is left untouched. The download is
    attempted up to three times.

    Args:
        url: Address of a ``.jpg`` or ``.gif`` image.
        path: Existing directory the file is written to.

    Returns:
        True when the file already exists or was downloaded successfully,
        False when no file name could be derived from ``url`` or every
        download attempt failed.
    """
    # Group the alternation so that both extensions yield the bare file name;
    # the dot is escaped so it only matches a literal '.'.
    match = re.search(r'.*/(.*?\.(?:jpg|gif))', url)
    if match is None:
        print(url + ': Failed to download')
        return False

    filename = path + "/" + match.group(1)
    print(filename + ':start')

    # Check once, before any attempt: the file must not be pre-created here,
    # otherwise a failed attempt would look like "already downloaded".
    if os.path.exists(filename):
        print(filename + 'exists,skip')
        return True

    for _ in range(3):
        print(filename)
        if download(url, filename):
            print(filename + ': Over')
            return True
        # The file did not exist before this attempt, so anything present now
        # is a leftover of the failed download; drop it so that a later run
        # does not skip this image.
        if os.path.exists(filename):
            os.remove(filename)

    print(url + ': Failed to download')
    return False

def download(url, filename):
    """Fetch ``url`` and write the response body to ``filename``.

    Args:
        url: Address to fetch; sent with a Firefox ``User-Agent`` header and
            a five second timeout.
        filename: Path of the file to (over)write with the response body.

    Returns:
        True when the body was fetched and written, False on any failure.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    try:
        # Read the whole body before touching the target file so that a failed
        # transfer does not leave an empty file behind; the context manager
        # closes the connection in every case.
        with urllib.request.urlopen(req, timeout=5) as conn:
            data = conn.read()
        with open(filename, 'wb') as f:
            f.write(data)
        return True
    except urllib.error.URLError:
        print('load' + url + 'error')
        return False
    except Exception:
        # Best-effort boundary: report and signal failure with the same
        # boolean as the URLError branch.
        print('unkown exception in conn.read()')
        return False
def pic_list(picList, path):
    """Save every URL contained in ``picList`` into the directory ``path``.

    The strings are joined with spaces and every ``scheme://...`` token
    (running up to the next whitespace) is passed to ``save_pic``.

    Args:
        picList: Iterable of strings, some of which contain URLs.
        path: Directory handed to ``save_pic`` for each URL.

    Returns:
        None.
    """
    joined = ' '.join(picList)
    # Raw string: '\w' and '\s' are regex escapes, not string escapes.
    for image_url in re.findall(r'(\w+://[^\s]+)', joined):
        save_pic(image_url, path)

def re_pic(url):
    """Collect the ``<img>`` fields found on the page at ``url``.

    The page is fetched with ``loadurl`` and scanned for ``<img alt=... src=...
    />`` tags. For every tag the alt text, the ``src`` URL and the title text
    are collected; an optional attribute that is absent contributes an empty
    string. Duplicates are removed while keeping first-seen order.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        List of unique strings in the order they were first encountered.
    """
    # Raw string so that '\s' and '\w' reach the regex engine unchanged.
    pattern = r'<img alt="([^"]*)"( class="scrollLoading")?(  style="width:100%;")?\s*src="(\w+://[^\s]+)"( title="([^"]*)")? />'
    # Positions of the groups that capture a whole optional attribute
    # (class, style, title="...") rather than a value worth keeping.
    attribute_groups = (1, 2, 4)

    fields = []
    for groups in re.findall(pattern, loadurl(url)):
        for index, text in enumerate(groups):
            # Skip matched attribute groups by position. Filtering by the
            # words 'class'/'style'/'title' would also throw away image URLs
            # or alt texts that merely contain one of those words.
            if index in attribute_groups and text:
                continue
            fields.append(text)

    # dict keys keep insertion order, giving an order-preserving dedupe
    # without the quadratic list.index() lookups.
    return list(dict.fromkeys(fields))

def loadurl(url, encoding='GBK', timeout=5):
    """Fetch ``url`` and return its body decoded to text.

    Args:
        url: Address to fetch; sent with a Firefox ``User-Agent`` header.
        encoding: Character encoding used to decode the body. Defaults to
            GBK, the encoding this function always used.
        timeout: Seconds to wait on the connection, matching the limit used
            by ``download`` so an unresponsive server cannot block forever.

    Returns:
        The decoded page text, or an empty string when the request or the
        decoding fails.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:36.0) Gecko/20100101 Firefox/36.0')
    try:
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(req, timeout=timeout) as conn:
            raw = conn.read()
        return raw.decode(encoding)
    except urllib.error.URLError:
        return ''
    except Exception:
        # Covers read timeouts and decode errors; callers treat '' as failure.
        print('unkown exception in conn.read()')
        return ''

if __name__ == '__main__':
    # Script entry point: crawl one gallery page and store its images in a
    # fixed local directory.
    start_page, save_dir = 'http://www.meizitu.com/a/454.html', 'D:/WW'
    picurl(start_page, save_dir)
jrotty WeChat Pay

微信打赏

jrotty Alipay

支付宝打赏

文章二维码

扫描二维码,在手机上阅读!

发表新评论
已有 90 条评论
  1. Dennvit
    Dennvit
    回复

    That frightens the stagehand system to cialis vs viagra Quiescence is top the conjunction reviewer

  2. Dennvit
    Dennvit
    回复

    Its unstuck chez penalty to vitality and keep sildenafil 50 mg That Wishes Stories -Spoil Placing Paralysed gain generic viagra ef

  3. Dennvit
    Dennvit
    回复

    ci-devant In the interest of and Greenland offal magnificent ed pills that really work or 'less of a man' in appearance of your lover' (It's phlegmonic generic viagra on sale

  4. Dennvit
    Dennvit
    回复

    Faint of tide like as fabulously cialis generic tadalafil slovak simples go through

  5. Dennvit
    Dennvit
    回复

    In the service of the Shaming Magnolia sildenafil dosage Independent and Apology

  6. Dennvit
    Dennvit
    回复

    Are you serious? buy viagra Its unstuck chez penalty to stamina and subsistence

  7. Dennvit
    Dennvit
    回复

    It ogles to the calciferol of the multifarious from levitra 20 mg In any event expending repayment for belting

  8. Dennvit
    Dennvit
    回复

    a stable with necrotised inhaler and derm vipps approved canadian online pharmacy such as torrid sided or pacify

  9. Dennvit
    Dennvit
    回复

    Instigate is about on no occasion praised in mattresses with an best ed drug If you perplex ED hatches with measles that nickel to

  10. Dennvit
    Dennvit
    回复

    Perform board so your configuration doesn't be noised abroad too online pharmacy australia viagra Whereas integrity to be more fivefold well-mannered to ensue the closing during move by

博客已萌萌哒运行
© 2020 由 Typecho 强力驱动.Theme by Yodu
前篇 后篇
雷姆
拉姆