王锋的博客

BeautifulSoup 的应用例子。

王锋 2009年12月03日星期四 22:24 | 4583次浏览 | 2条评论

以下是一个抓取 http://www.bidders.co.jp 网站信息的程序，现在把源代码贴出来，希望大家多多指点。程序读取的输入配置文件（ctet）没有在这里列出；程序最终生成的是 XML 配置文件（bidder_jp.xml）。

#-*-coding:utf-8-*-
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import string
def get_links(url,id):
    """Fetch a bidders.co.jp page and collect the links in its main list.

    The page is searched for the first ``<div class="mainlist_box_cup">``;
    inside it, every ``<td colspan="2">`` contributes the ``href`` of its
    first anchor, prefixed with the site root.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is
            stripped before the request is made.
        id: Identifier that is handed back unchanged so the caller can keep
            it paired with the returned links.

    Returns:
        A ``(links, id)`` tuple. ``links`` is a list of URL strings; it is
        empty when the page has no main-list container.
    """
    # The hrefs are appended directly to the site root, so the root must
    # not carry padding: a space here ends up inside every generated URL.
    base = "http://www.bidders.co.jp"

    print(url)
    response = urllib2.urlopen(url.strip())
    try:
        html = response.read()
    finally:
        # Release the connection even when the download fails midway.
        response.close()
    soup = BeautifulSoup(html)

    container = soup.find('div', {"class": "mainlist_box_cup"})
    if container is None:
        # No listing on this page: report "no links" instead of crashing
        # on attribute access against None.
        link = []
    else:
        link = [
            base + cell.a['href'].strip()
            for cell in container.findAll('td', colspan="2")
            # Skip cells that hold no anchor or an anchor without href.
            if cell.a is not None and cell.a.get('href')
        ]
    print(link)
    return (link, id)

def judge_link(link,id):
    """Walk links level by level, recording every leaf listing URL.

    Each URL is classified by its second-to-last ``/``-separated segment:

    * ``list1`` -- the URL is appended to the ``text2`` file via ``write``.
    * ``categ`` -- the page is expanded with ``get_links`` and the links it
      yields form the next level to examine.
    * anything else -- the URL is ignored.

    The walk stops when a level produces no further links.

    Args:
        link: List of URL strings to start from.
        id: Identifier written next to every recorded URL and passed
            through each ``get_links`` call.

    Returns:
        None.
    """
    # Category pages that were already expanded. Without this record a
    # category that is listed twice, or that links back to an ancestor,
    # would be fetched again and again.
    expanded = set()

    while link:
        print("judge_link")
        categories = []
        for slink in link:
            print("slink= %s id= %s" % (slink, id))
            link_list = slink.split('/')
            print(link_list)
            if len(link_list) < 2:
                # No path segment to classify.
                continue
            kind = link_list[-2]
            if kind == "list1":
                print("text2")
                write("text2", slink, id)
            elif kind == "categ" and slink not in expanded:
                # Category URLs are only expanded, never recorded.
                expanded.add(slink)
                categories.append(slink)
        print("%s 循环1" % (categories,))

        next_level = []
        for category in categories:
            (children, id) = get_links(category, id)
            next_level.extend(children)
        link = next_level
        print("%s 循环" % (link,))
def write(text,http,id):
    """Append one ``url|id`` record, terminated by a newline, to a file.

    Args:
        text: Path of the file to append to. Append mode creates the file
            when it does not exist yet.
        http: URL stored before the ``|`` separator.
        id: Identifier stored after the ``|`` separator.

    Returns:
        None.
    """
    # Both branches of the former empty/non-empty check wrote the same
    # record, so the extra read (whose handle was never closed) is gone.
    with open(text, "a") as f:
        f.write("%s|%s\n" % (http, id))
def get_link(ctet):
    """Load ``url|id`` records from a file into two parallel lists.

    Every line is split on ``|``; the first field is taken as the URL and
    the second as the identifier, both stripped of surrounding whitespace.
    Lines with fewer than two fields (blank lines included) are skipped.

    Args:
        ctet: Path of the file to read.

    Returns:
        ``[urls, ids]`` -- two lists of equal length. Both are empty when
        the file holds no usable record, so callers can always index
        ``[0]`` and ``[1]``.
    """
    urls = []
    ids = []
    with open(ctet, 'r') as f:
        for line in f:
            fields = line.split("|")
            if len(fields) < 2:
                # Blank or malformed line: nothing to pair up.
                continue
            urls.append(fields[0].strip())
            ids.append(fields[1].strip())
    http = [urls, ids]
    print(http)
    return http
def create_file():
    """Create the ``text1`` and ``text2`` work files, emptying existing ones."""
    first = open("text1", "w")
    second = open('text2', "w")
    for handle in (first, second):
        handle.close()
    print("files have been created")
def sort():
    """Crawl every start URL listed in the ``ctet`` file.

    The work files are reset with ``create_file``, the ``url|id`` records
    are loaded with ``get_link``, and each record is crawled by calling
    ``get_links`` followed by ``judge_link``. A record whose crawl raises
    is retried, for at most six attempts in total, and then abandoned.

    Returns:
        The string ``"done"``.
    """
    import traceback

    max_attempts = 6

    create_file()
    http = get_link('ctet')
    # Guard against an empty result before indexing into it.
    records = zip(http[0], http[1]) if http else []

    for url, start_id in records:
        for _attempt in range(max_attempts):
            try:
                print("first")
                (links, start_id) = get_links(url, start_id)
                judge_link(links, start_id)
                break
            except Exception:
                # Only ordinary errors are retried; KeyboardInterrupt and
                # SystemExit propagate so the crawl can be stopped. The
                # traceback is reported rather than silently discarded.
                traceback.print_exc()
    return "done"
def write_begin():
    """Run the crawl and write the resulting config to ``bidder_jp.xml``.

    Builds a ``<config>`` document with a fixed ``<parameters>`` section
    and a ``<categories>`` section, calls ``sort`` to perform the crawl,
    then adds one ``<category starturl=...><cat id=.../></category>``
    element per ``url|id`` line found in the ``text2`` file. The document
    is printed and written to ``bidder_jp.xml``.

    Returns:
        None.
    """
    from xml.dom import minidom

    # Empty the output file before the crawl starts.
    open("bidder_jp.xml", "w").close()

    doc = minidom.Document()
    config = doc.createElement('config')
    doc.appendChild(config)
    parameters = doc.createElement("parameters")
    config.appendChild(parameters)
    categories = doc.createElement("categories")
    config.appendChild(categories)

    # Fixed name/value pairs emitted as <parameter> elements, in order.
    fixed_parameters = (
        ("name", "url"),
        ("sourceid", "4"),
        ("table", "Tables/ArticleAdd.xml"),
    )
    for name, value in fixed_parameters:
        parameter = doc.createElement("parameter")
        parameter.setAttribute("name", name)
        parameter.setAttribute("value", value)
        parameters.appendChild(parameter)

    # The crawl fills the text2 file that is read back below.
    sort()

    with open("text2", "r") as f:
        for eachline in f:
            print(eachline)
            fields = eachline.split("|")
            if len(fields) < 2:
                # Blank or malformed line: no url/id pair to emit.
                continue
            category = doc.createElement("category")
            cat = doc.createElement("cat")
            category.setAttribute("starturl", fields[0].strip())
            cat.setAttribute("id", fields[1].strip())
            categories.appendChild(category)
            category.appendChild(cat)

    print(doc.toprettyxml(indent = " "))
    with open("bidder_jp.xml", "w") as out:
        out.write(doc.toprettyxml())

def final_result():
    """Crawl every ``url|id`` record stored in the ``text1`` file.

    Each record is expanded with ``get_links`` and the resulting links are
    handed to ``judge_link``. Nothing happens when the file holds no
    records.

    Returns:
        None.
    """
    print("finlal")
    # Read the file once; the former second read was never used.
    http = get_link("text1")
    if not http:
        # Empty result: there is nothing to index into.
        return
    for url, start_id in zip(http[0], http[1]):
        (links, start_id) = get_links(url, start_id)
        judge_link(links, start_id)

# Script entry point: run the crawl and write the XML config. The call
# must be indented under the guard, otherwise the file does not parse.
if __name__=="__main__":
    write_begin()

分享添加到桌面