王锋 · Thursday, December 3, 2009, 22:24
Below is a scraper that crawls listing information from http://www.bidders.co.jp. I'm posting the source code here in the hope that people can offer some pointers. It depends on one configuration file that is not included in the post, and what it produces is also a configuration file.
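The missing input file is the one opened as 'ctet' in sort(). Judging from how get_link() parses it, each line holds a start URL and a category id separated by a vertical bar. A hypothetical example of its contents (the paths and ids below are made up for illustration, not taken from the real site):

    http://www.bidders.co.jp/categ/2084024181|101
    http://www.bidders.co.jp/categ/2084024182|102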
 #-*-coding:utf-8-*-

 from BeautifulSoup import BeautifulSoup
 import urllib2
 
 def get_links(url, id):
     # Fetch one category page and collect the links in its main list box.
     print url
     html = urllib2.urlopen(url.strip())
     soup = BeautifulSoup(html)
     soup_div = soup.find('div', {"class": "mainlist_box_cup"})
     soup_td = soup_div.findAll('td', colspan="2")
     link = []
     for td in soup_td:
         # The hrefs on the page are relative, so prepend the site root.
         linkstr = "http://www.bidders.co.jp" + td.a['href'].strip()
         link.append(linkstr)
     print link
     return (link, id)

 def judge_link(link, id):
     # Breadth-first walk over the category tree: links whose second-to-last
     # path segment is "list1" are leaf list pages and get written out;
     # "categ" links are category pages that need to be expanded further.
     while link != []:
         print "judge_link"
         linkv = []
         for slink in link:
             print "slink=", slink, "id=", id
             link_list = slink.split('/')
             print link_list
             if link_list[-2] == "list1":
                 print "text2"
                 write("text2", slink, id)
             elif link_list[-2] == "categ":
                 linkv.append(slink)
         print linkv, "loop 1"
         tlink = []
         for linkc in linkv:
             (clink, id) = get_links(linkc, id)
             for word in clink:
                 tlink.append(word)
         link = tlink
         print link, "loop"
         # text="text1"
         # print "text1"
         # write(text, slink, id)
 
 def write(text, http, id):
     # Append one "url|id" record to the given text file.
     content = "%s|%s\n" % (http, id)
     f = open(text, "a")
     f.write(content)
     f.close()
 
 def get_link(ctet):
     # Read "url|id" records back into two parallel lists.
     f = open(ctet, 'r')
     lines = f.readlines()
     f.close()
     url = []
     id = []
     for line in lines:
         temlist = line.split("|")
         url.append(temlist[0].strip())
         id.append(temlist[1].strip())
     http = [url, id]
     print http
     return http
 
 def create_file():
     # Truncate (or create) the two intermediate output files.
     f1 = open("text1", "w")
     f2 = open("text2", "w")
     f1.close()
     f2.close()
     print "files have been created"
 
 def sort():
     create_file()
     http = get_link('ctet')
     for i in range(len(http[0])):
         url = http[0][i]
         id = http[1][i]
         # Retry each start URL up to five times on failure.
         m = 0
         while m <= 5:
             try:
                 print "first"
                 (links, id) = get_links(url, id)
                 judge_link(links, id)
                 break
             except Exception:
                 m = m + 1
                 continue
     return "done"
 
 def write_begin():
     from xml.dom import minidom

     # Build the <config> skeleton: three fixed parameters plus an
     # (initially empty) categories list.
     doc = minidom.Document()
     config = doc.createElement('config')
     doc.appendChild(config)
     parameters = doc.createElement("parameters")
     config.appendChild(parameters)
     categories = doc.createElement("categories")
     config.appendChild(categories)
     pa0 = doc.createElement("parameter")
     pa1 = doc.createElement("parameter")
     pa2 = doc.createElement("parameter")
     parameters.appendChild(pa0)
     parameters.appendChild(pa1)
     parameters.appendChild(pa2)
     pa0.setAttribute("name", "name")
     pa0.setAttribute("value", "url")
     pa1.setAttribute("name", "sourceid")
     pa1.setAttribute("value", "4")
     pa2.setAttribute("name", "table")
     pa2.setAttribute("value", "Tables/ArticleAdd.xml")

     s = sort()
     if s == "done":
         # Turn every "url|id" record in text2 into a <category> element.
         f = open("text2", "r")
         for eachline in f.readlines():
             print eachline
             url = eachline.split("|")
             http = url[0].strip()
             id = url[1].strip()
             category = doc.createElement("category")
             cat = doc.createElement("cat")
             category.setAttribute("starturl", http)
             cat.setAttribute("id", id)
             categories.appendChild(category)
             category.appendChild(cat)
         f.close()

     print doc.toprettyxml(indent=" ")
     f1 = open("bidder_jp.xml", "w")
     f1.write(doc.toprettyxml())
     f1.close()
     # Unfinished alternative: write through a UTF-8 codec writer.
     # import codecs
     # writer = codecs.lookup("utf-8")[3](open("bidder_jp.xml", "w"))
     # doc.writexml(writer, "\t", "\t", "\n", encoding="utf-8")
     # writer.close()

 def final_result():
     print "final"
     http = get_link("text1")
     for i in range(len(http[0])):
         url = http[0][i]
         id = http[1][i]
         (links, id) = get_links(url, id)
         judge_link(links, id)
 
 if __name__=="__main__":
 
     write_begin()   
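For reference, the bidder_jp.xml that write_begin() emits should look roughly like this (minidom's exact pretty-printing and attribute order may differ; the starturl and id values below are placeholders, not real output):

    <?xml version="1.0" ?>
    <config>
        <parameters>
            <parameter name="name" value="url"/>
            <parameter name="sourceid" value="4"/>
            <parameter name="table" value="Tables/ArticleAdd.xml"/>
        </parameters>
        <categories>
            <category starturl="http://www.bidders.co.jp/list1/...">
                <cat id="101"/>
            </category>
        </categories>
    </config>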
Reply — peter, Saturday, December 5, 2009, 01:43
Thanks again.
Reply — 王锋, Saturday, December 5, 2009, 09:56