2007年06月01日 星期五 09:57
用Firefox的Download All
插件从新浪读书频道下载了文章。想在Treo650里看纯文本。然后就用了以下两个小程序。算是Unix思想的一种体现吧。小工具只做一件事情。大家把自己电脑里的小宝贝拿出来分享啊。或者在网站上多一个Wiki页面?
#####################
#html2txt.py
#####################
from formatter import AbstractFormatter, NullWriter
from htmllib import HTMLParser
def _(str, in_encoder="gbk", out_encoder="utf8"):
    """Transcode a byte string from one character encoding to another.

    Args:
        str: Encoded byte string to convert.  The name shadows the builtin
            but is kept so that existing keyword callers keep working.
        in_encoder: Codec name the input bytes are encoded with.
        out_encoder: Codec name used to encode the result.

    Returns:
        The same text as a byte string encoded with ``out_encoder``.

    Raises:
        UnicodeDecodeError: If the input is not valid ``in_encoder`` data.
        UnicodeEncodeError: If the text cannot be represented in
            ``out_encoder``.
    """
    # ``.decode()`` gives the same result as ``unicode(str, in_encoder)``
    # for byte strings, without relying on the Python-2-only ``unicode``
    # builtin.
    return str.decode(in_encoder).encode(out_encoder)
class myWriter(NullWriter):
    """Writer that collects flowing text fragments instead of rendering them."""

    def __init__(self):
        NullWriter.__init__(self)
        # Fragments in the order they were received.
        self._bodyText = []

    def send_flowing_data(self, str):
        """Record one fragment of flowing text."""
        self._bodyText += [str]

    def _get_bodyText(self):
        """Return every recorded fragment, joined with newlines."""
        separator = '\n'
        return separator.join(self._bodyText)

    # Read-only view of the collected text.
    bodyText = property(fget=_get_bodyText, doc='plain text from body')
class myHTMLParser(HTMLParser):
def do_meta(self, attrs):
self.metas = attrs
def convertFile(filename):
    """Convert one HTML file to plain text.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        A ``(title, body)`` tuple.  ``title`` is the document title passed
        through ``_`` (by default gbk -> utf8), or an empty string when the
        document has no title.  ``body`` is the text collected by the
        ``myWriter`` instance; it is returned as collected, not transcoded.

    Raises:
        IOError: If the file cannot be opened or read.
    """
    mywriter = myWriter()
    absformatter = AbstractFormatter(mywriter)
    parser = myHTMLParser(absformatter)

    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    source = open(filename)
    try:
        parser.feed(source.read())
    finally:
        source.close()

    # feed() may keep an incomplete tail buffered; close() forces the
    # parser to process it as if end-of-file had been reached.
    parser.close()

    # parser.title may be None for a document without <title>; fall back to
    # an empty string so the transcoding helper does not raise.
    return (_(parser.title or ""), mywriter.bodyText)
import os
import os.path
# Directory the converted .txt files are written to (created if missing).
OUTPUTDIR = "./txt"
# Directory whose .htm/.html files are converted (not searched recursively).
INPUTDIR = "."
if __name__ == "__main__":
    import sys

    if not os.path.exists(OUTPUTDIR):
        os.mkdir(OUTPUTDIR)

    for name in os.listdir(INPUTDIR):
        if not name.endswith(('.htm', '.html')):
            continue

        # Show progress before the (possibly slow) conversion starts.
        sys.stdout.write("Converting %s " % name)
        sys.stdout.flush()

        # os.listdir() yields bare names, so resolve them against INPUTDIR;
        # otherwise the script only works when INPUTDIR is the current
        # directory.
        _title, text = convertFile(os.path.join(INPUTDIR, name))

        outfilename = os.path.splitext(name)[0] + '.txt'
        outfullname = os.path.join(OUTPUTDIR, outfilename)
        out = open(outfullname, "wt")
        try:
            out.write(text)
        finally:
            # Make sure the text is flushed to disk before reporting success.
            out.close()

        sys.stdout.write("Done!\n")
################################
#pickupcontent.py
################################
# -*- coding: utf-8 -*-
import sys
import glob
import os
import re
# Line-start markers, as gb2312 byte patterns.  They are written with unicode
# escapes so this source stays pure ASCII and does not depend on a coding
# declaration being honoured.
# U+516B U+5341: first two characters of the article title line.
startstr = u"^\u516b\u5341".encode("gb2312")
# "[" followed by U+8FD4 U+56DE: the first line after the article body.
endstr = u"^\\[\u8fd4\u56de".encode("gb2312")
tmp_start = re.compile(startstr)
tmp_end = re.compile(endstr)


def _content_span(lines, start_re, end_re):
    """Locate the article body inside ``lines``.

    Args:
        lines: Sequence of lines (byte strings) of one file.
        start_re: Compiled pattern matching the first line to keep.
        end_re: Compiled pattern matching the first line to drop.

    Returns:
        ``(start, end)`` such that ``lines[start:end]`` is the content: it
        begins at the last start match seen before the first end match and
        stops just before that end match.  ``None`` when no end marker is
        found or no start marker precedes it.
    """
    start = None
    for index, line in enumerate(lines):
        if start_re.match(line):
            start = index
        if end_re.match(line):
            if start is None:
                return None
            return (start, index)
    return None


def main(argv=None):
    """Trim every file named on the command line down to its article body.

    Args:
        argv: Argument list in ``sys.argv`` layout; defaults to ``sys.argv``.
            Wildcards in ``argv[1:]`` are expanded in place with ``glob``.

    Files in which the markers cannot be located are left untouched and
    reported on stderr.
    """
    if argv is None:
        argv = sys.argv
    # Expand wildcards ourselves so patterns passed unexpanded still work.
    argv[1:] = [item for arg in argv[1:] for item in glob.glob(arg)]

    for infile in argv[1:]:
        # Binary mode: the markers are byte patterns and the content must be
        # written back unmodified.
        f = open(infile, 'rb')
        try:
            lines = f.readlines()
        finally:
            f.close()

        span = _content_span(lines, tmp_start, tmp_end)
        if span is None:
            # Never rewrite a file using missing or stale indices.
            sys.stderr.write(
                "skipping %s: content markers not found\n" % infile)
            continue
        kstart, kend = span

        # Write next to the original so the rename stays on one filesystem
        # and an unrelated file named "tmp" is never clobbered.
        tmpname = infile + '.tmp'
        tmp = open(tmpname, 'wb')
        try:
            tmp.writelines(lines[kstart:kend])
        finally:
            tmp.close()
        # Remove first: os.rename() refuses to overwrite on some platforms.
        os.remove(infile)
        os.rename(tmpname, infile)


if __name__ == "__main__":
    main()
-------------- 下一部分 --------------
一个HTML附件被移除...
URL: http://python.cn/pipermail/python-chinese/attachments/20070601/2155e424/attachment.htm
2007年06月01日 星期五 10:19
On 6/1/07, Ben Luo <benluo在gmail.com> wrote: > 用Firefox的Download All > 插件从新浪读书频道下载了文章。想在Treo650里看纯文本。然后就用了以下两个小程序。算是Unix思想的一种体现吧。小工具只做一件事情。大家把自己电脑里的小宝贝拿出来分享啊。或者在网站上多一个Wiki页面? > 在啄木鸟维基一直有个"微项目" 的页面收集大家平日随手解决的小需求的开心代码; 你的收集到快乐的六.一节这天的页面了! http://wiki.woodpecker.org.cn/moin/MicroProj/2007-06-01 感谢分享! 提示,进行注释,以便帮助新人快速理解你的技巧,思路呢 ;) > ##################### > #html2txt.py > ##################### > > from formatter import AbstractFormatter, NullWriter > from htmllib import HTMLParser > > def _(str, in_encoder="gbk", out_encoder="utf8"): > return unicode(str, in_encoder).encode(out_encoder) > > > class myWriter(NullWriter): > def __init__(self): > NullWriter.__init__(self) > self._bodyText = [] > > def send_flowing_data(self, str): > self._bodyText.append(str) > > def _get_bodyText(self): > return '\n'.join(self._bodyText) > > bodyText = property(_get_bodyText, None, None, 'plain text from body') > > class myHTMLParser(HTMLParser): > def do_meta(self, attrs): > self.metas = attrs > > def convertFile(filename): > mywriter = myWriter() > absformatter = AbstractFormatter(mywriter) > parser = myHTMLParser(absformatter) > parser.feed(open(filename).read()) > return ( _(parser.title), > parser.formatter.writer.bodyText ) > > import os > import os.path > > OUTPUTDIR = "./txt" > INPUTDIR = "." > if __name__ == "__main__": > if not os.path.exists(OUTPUTDIR): > os.mkdir(OUTPUTDIR) > > for file in os.listdir(INPUTDIR): > if file[-4:] == '.htm' or file[-5:] == '.html': > print "Coverting", file, > outfilename = os.path.splitext(file)[0] > a, text = convertFile(file) > outfilename = outfilename + '.txt' > outfullname = os.path.join(OUTPUTDIR, outfilename) > open(outfullname, "wt").write(text) > print "Done!" 
> > ################################ > #pickupcontent.py > ################################ > > # -*- coding: utf-8 -*- > > import sys > import glob > import os > import re > > sys.argv[1:] = [item for arg in sys.argv[1:] for item in glob.glob(arg)] > startstr = u"^八十".encode("gb2312") # article title > endstr = u"^\[返回".encode("gb2312") # > tmp_start = re.compile(startstr) > tmp_end = re.compile(endstr) > for infile in sys.argv[1:]: > # print infile > f = open(infile,'r') > #print f > lines = f.readlines() > fout = '' > for index, line in enumerate(lines): > if tmp_start.match(line): > kstart = index > if tmp_end.match(line): > kend = index > break > > f.close() > fout = fout.join(lines[kstart:kend]) > tmp = open('tmp','w') > tmp.write(fout) > tmp.close() > os.remove(infile) > os.rename('tmp',infile) > > > > _______________________________________________ > python-chinese > Post: send python-chinese在lists.python.cn > Subscribe: send subscribe to > python-chinese-request在lists.python.cn > Unsubscribe: send unsubscribe to > python-chinese-request在lists.python.cn > Detail Info: > http://python.cn/mailman/listinfo/python-chinese > -- '''Time is unimportant, only life important! http://zoomquiet.org blog在http://blog.zoomquiet.org/pyblosxom/ wiki在http://wiki.woodpecker.org.cn/moin/ZoomQuiet scrap在http://floss.zoomquiet.org douban在http://www.douban.com/people/zoomq/ ____________________________________ Pls. use OpenOffice.org to replace M$ Office. http://zh.openoffice.org Pls. use 7-zip to replace WinRAR/WinZip. http://7-zip.org/zh-cn/ You can get the truely Freedom 4 software. '''
2007年06月01日 星期五 11:07
On 6/1/07, Zoom. Quiet <zoom.quiet在gmail.com> wrote: > > On 6/1/07, Ben Luo <benluo在gmail.com> wrote: > > 用Firefox的Download All > > > 插件从新浪读书频道下载了文章。想在Treo650里看纯文本。然后就用了以下两个小程序。算是Unix思想的一种体现吧。小工具只做一件事情。大家把自己电脑里的小宝贝拿出来分享啊。或者在网站上多一个Wiki页面? > > > 在啄木鸟维基一直有个"微项目" 的页面收集大家平日随手解决的小需求的开心代码; > 你的收集到快乐的六.一节这天的页面了! > http://wiki.woodpecker.org.cn/moin/MicroProj/2007-06-01 > > 感谢分享! > > 提示,进行注释,以便帮助新人快速理解你的技巧,思路呢 ;) html2txt.py 也是从网上下载的。一个中国朋友写的。我只是用,也不是很理解。 下一个文件我在Wiki中以后注释。 -------------- 下一部分 -------------- 一个HTML附件被移除... URL: http://python.cn/pipermail/python-chinese/attachments/20070601/f22cb5d4/attachment.htm
2007年06月01日 星期五 11:19
On 6/1/07, Ben Luo <benluo在gmail.com> wrote: > > > On 6/1/07, Zoom. Quiet <zoom.quiet在gmail.com> wrote: > > On 6/1/07, Ben Luo <benluo在gmail.com> wrote: > > > 用Firefox的Download All > > > > 插件从新浪读书频道下载了文章。想在Treo650里看纯文本。然后就用了以下两个小程序。算是Unix思想的一种体现吧。小工具只做一件事情。大家把自己电脑里的小宝贝拿出来分享啊。或者在网站上多一个Wiki页面? > > > > > 在啄木鸟维基一直有个"微项目" 的页面收集大家平日随手解决的小需求的开心代码; > > 你的收集到快乐的六.一节这天的页面了! > > http://wiki.woodpecker.org.cn/moin/MicroProj/2007-06-01 > > > > 感谢分享! > > > > 提示,进行注释,以便帮助新人快速理解你的技巧,思路呢 ;) > > html2txt.py 也是从网上下载的。一个中国朋友写的。我只是用,也不是很理解。 咔咔咔!!在使用中谅解,在沟通中深入,技艺就是这么获得的 ;) > 下一个文件我在Wiki中以后注释。 > > > > _______________________________________________ > python-chinese > Post: send python-chinese在lists.python.cn > Subscribe: send subscribe to > python-chinese-request在lists.python.cn > Unsubscribe: send unsubscribe to > python-chinese-request在lists.python.cn > Detail Info: > http://python.cn/mailman/listinfo/python-chinese > -- '''Time is unimportant, only life important! http://zoomquiet.org blog在http://blog.zoomquiet.org/pyblosxom/ wiki在http://wiki.woodpecker.org.cn/moin/ZoomQuiet scrap在http://floss.zoomquiet.org douban在http://www.douban.com/people/zoomq/ ____________________________________ Pls. use OpenOffice.org to replace M$ Office. http://zh.openoffice.org Pls. use 7-zip to replace WinRAR/WinZip. http://7-zip.org/zh-cn/ You can get the truely Freedom 4 software. '''
Zeuux © 2025
京ICP备05028076号