A Python script for scraping images from Huaban (huaban.com)

jopen · 11 years ago

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# author: insun
# http://yxmhero1989.blog.163.com/blog/static/112157956201311994027168/
import urllib, urllib2, re, sys, os

reload(sys)
sys.setdefaultencoding('utf-8')  # reload(sys) is only useful for this call; assumed lost in the original paste

# url = 'http://huaban.com/favorite/'
if not os.path.exists('beauty'):
    os.mkdir('beauty')

def get_huaban_beauty():
    pin_id = 48145457
    limit = 20  # the site allows a limit of up to 100 per request
    while pin_id != None:
        url = 'http://huaban.com/favorite/beauty/?max=' + str(pin_id) + '&limit=' + str(limit) + '&wfl=1'
        try:
            i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) "
                                       "Gecko/20090624 Firefox/3.5",
                         "Referer": 'http://baidu.com/'}
            req = urllib2.Request(url, headers=i_headers)
            html = urllib2.urlopen(req).read()
            reg = re.compile('"pin_id":(.*?),.+?"file":{"farm":"farm1", "bucket":"hbimg",'
                             '.+?"key":"(.*?)",.+?"type":"image/(.*?)"', re.S)
            groups = re.findall(reg, html)
            print str(pin_id) + ': start to catch ' + str(len(groups)) + ' photos'
            for att in groups:
                pin_id = att[0]  # the last pin_id on this page becomes the "max" value of the next request
                att_url = att[1] + '_fw554'
                img_type = att[2]
                img_url = 'http://img.hb.aicdn.com/' + att_url
                # urlretrieve returns a (filename, headers) tuple, so this branch always succeeds
                if urllib.urlretrieve(img_url, 'beauty/' + att_url + '.' + img_type):
                    print img_url + '.' + img_type + ' download success!'
                else:
                    print img_url + '.' + img_type + ' save failed'
            # print pin_id
        except:
            print 'error occurs'
            break  # stop instead of re-requesting the same page forever

get_huaban_beauty()
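The script above is Python 2 (urllib2, print statements). For readers on Python 3, below is a minimal sketch of the same crawl loop using the requests library instead. It assumes the same huaban.com endpoint, query parameters (max, limit, wfl=1), CDN prefix img.hb.aicdn.com, and JSON fields matched by the regular expression as the original script; the site has changed over the years, so these URL patterns and fields may no longer be valid, and the helper names (fetch_page, download) are just illustrative.

# Minimal Python 3 sketch of the same crawl loop, using requests.
# Endpoint, parameters and CDN prefix are taken from the original script
# and may no longer work on today's huaban.com.
import os
import re
import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Referer": "http://baidu.com/",
}
# Simplified version of the original pattern: capture pin_id, file key and image type.
PIN_RE = re.compile(r'"pin_id":(\d+).+?"key":"(.*?)".+?"type":"image/(\w+)"', re.S)

def fetch_page(pin_id, limit=20):
    # One page of favorites, paginated by the last pin_id seen ("max").
    url = ("http://huaban.com/favorite/beauty/?max=%s&limit=%s&wfl=1"
           % (pin_id, limit))
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    return PIN_RE.findall(resp.text)

def download(key, img_type, out_dir="beauty"):
    # Fetch the resized image from the CDN and write it to disk.
    os.makedirs(out_dir, exist_ok=True)
    img_url = "http://img.hb.aicdn.com/" + key + "_fw554"
    data = requests.get(img_url, headers=HEADERS, timeout=10).content
    path = os.path.join(out_dir, key + "." + img_type)
    with open(path, "wb") as f:
        f.write(data)
    return path

if __name__ == "__main__":
    pin_id = 48145457          # starting pin, as in the original script
    for _ in range(3):         # crawl a few pages only
        pins = fetch_page(pin_id)
        if not pins:
            break
        for pin_id, key, img_type in pins:
            print("saved", download(key, img_type))

The pagination works the same way as in the original: each response is scanned for pin IDs, and the last ID on a page is passed as the max parameter of the next request, so the crawl walks backwards through the board until a page comes back empty.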