Python Scraping: Regular Expressions vs. PyQuery


Target page to scrape: http://ypk.39.net/2017019/manual

The goal is to extract the drug's package-insert (instruction manual) content.

First, the regular-expression approach:

#-*- coding:gbk -*-
import urllib2
import re
import random
import os

def ziduantiqu(page):   # "field extraction"
    # pattern1 captures the approval number (批准文号), used as the file name;
    # pattern2 captures the dt/dd field blocks inside the tab_box div.
    pattern1 = re.compile('<div class="tab_box">(.*?)批准文号.*?<dd>(.*?)<.*?</div>', re.S)
    pattern2 = re.compile('<div class="tab_box">(.*?)<dt>(.*?)</div>', re.S)
    items1 = re.findall(pattern1, page)
    for item1 in items1:
        filename = re.sub("[^A-Z0-9]", "", item1[1])
        # Avoid clobbering an existing file with the same approval number.
        if (filename + '.txt') in os.listdir("E:/yaopintong1/"):
            filename = filename + '_1'
        print filename
    items2 = re.findall(pattern2, page)
    for item2 in items2:
        # Collapse whitespace, turn block-closing tags into newlines,
        # then strip every remaining tag.
        content = re.sub("[\s]", "", item2[1])
        content = content.replace('</dt>', '\n')
        content = content.replace('</strong><br/>', '')
        content = content.replace('<br/>', '\n')
        content = content.replace('</p>', '\n')
        content = content.replace('</dd>', '\n')
        content = content.replace('&nbsp;', '')
        dr = re.compile(r'<[^>]+>', re.S)
        dd = dr.sub('', content)
        print dd
        # filename comes from the loop above; the page is assumed to match pattern1.
        f1 = open('E:/yaopintong1/' + filename + '.txt', 'a')
        f1.write(dd)
        f1.close()

def proxy():
    # Pick a proxy at random and install it as the global opener,
    # so each request may go out through a different IP.
    proxylist = ('59.39.88.190:8080',
                 '59.41.154.148:3128',
                 '59.41.239.13:9797',
                 '59.42.251.197:80',
                 '59.42.251.214:80',
                 '59.42.251.215:80',
                 '59.42.251.216:80',
                 '59.49.145.151:3128',
                 '59.49.248.216:3128')
    ii = random.randint(0, 8)
    print ii
    proxies = {'http': proxylist[ii]}
    proxy_support = urllib2.ProxyHandler(proxies)
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)

if __name__ == '__main__':
    file = open("E://url2.txt")
    i = 1
    while 1:
        line = file.readline().rstrip()
        if not line:
            break
        print 'Scraping page ' + str(i) + '...'
        proxy()
        url = line + 'manual'
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        try:
            response = urllib2.urlopen(request, timeout=30)
            page = response.read()
        except Exception, e:
            print Exception, ":", e
            # Log failed URLs for a later retry.
            f1 = open('E:/url2_error.txt', 'a')
            f1.write(line + '\n')
            f1.close()
        else:
            ziduantiqu(page)
        print 'Page ' + str(i) + ' done'
        i = i + 1
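To see what that pile of replaces and the catch-all tag-stripping pattern r'<[^>]+>' actually accomplish, here is a minimal sketch run on a made-up dd fragment (the sample HTML is illustrative only, not taken from the real page):

import re

# Illustrative fragment shaped like the page's dd blocks (not real page HTML).
sample = '<dd><p>Take orally, 3 times a day.<br/></p></dd>'

# Turn block-closing tags into newlines, as ziduantiqu() does...
content = sample.replace('<br/>', '\n').replace('</p>', '\n').replace('</dd>', '\n')
# ...then strip whatever tags remain with the catch-all pattern.
text = re.sub(r'<[^>]+>', '', content)
print text.strip()   # -> Take orally, 3 times a day.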



The program above does batch collection: the page links are read line by line from a plain-text file, and the body is one big pile of matching and replacing operations. Pretty gross, right?
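For reference, each line of E://url2.txt is a drug-page prefix, and the script appends 'manual' to it to build the actual URL. Given the target page above, the file would look something like this (only the first line comes from this article; the second is a placeholder for further drug IDs):

http://ypk.39.net/2017019/
http://ypk.39.net/xxxxxxx/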

Now here is the pyquery version:

>>> from pyquery import PyQuery as pq
>>> import re
>>> v_source = pq(url='http://ypk.39.net/2017019/manual')
>>> for data in v_source('div').filter('.tab_box'):
...     # each tab_box holds matched dt (field name) / dd (field value) pairs
...     for i in range(len(pq(data).find('dt'))):
...         f = open('yaopin.txt', 'a')
...         f.write(re.sub(r'[\s]', '', pq(data).find('dt').eq(i).text().encode('utf8')))
...         f.write('\n')
...         f.write(pq(data).find('dd').eq(i).text().encode('utf8'))
...         f.write('\n')
...         f.close()
...         print pq(data).find('dt').eq(i).text()
...         print pq(data).find('dd').eq(i).text()
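As a side note, the index-based dt/dd pairing can be written a bit more compactly with pyquery's .items() iterator, which yields each matched element as its own PyQuery object. A minimal sketch, under the same assumption the loop above already makes, namely that dt and dd elements come in matched pairs:

from pyquery import PyQuery as pq

doc = pq(url='http://ypk.39.net/2017019/manual')
box = doc('div.tab_box')
# Walk field names (dt) and field values (dd) in parallel.
for dt, dd in zip(box.find('dt').items(), box.find('dd').items()):
    print dt.text().encode('utf8'), ':', dd.text().encode('utf8')

This avoids re-wrapping each element with pq() on every index lookup.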



Source: http://my.oschina.net/dfsj66011/blog/598826