Changeset 53919a34f793…
Parent 5ebe6fe359f0…
by Benjamin Pollack
Changes to one file · Browse files at 53919a34f793 Showing diff from parent 5ebe6fe359f0 Diff from another changeset...
@@ -1,20 +1,29 @@ #!/usr/bin/env python
import os
+import re
+import sys
from BeautifulSoup import BeautifulSoup
def cleanup(s):
    """Split a block of text into a list of sentences.

    Newlines and carriage returns are turned into spaces, http/https URLs
    are removed, and the remaining text is cut after every '?', '!' or '.'
    that is followed by a space or that ends the text.

    Returns a list of stripped, non-empty sentence strings.  Text that is
    not terminated by '?', '!' or '.' is discarded, so the list may be
    empty.
    """
    s = s.replace('\n', ' ').replace('\r', ' ')
    s = re.sub(r'https?://[^ ]+', '', s)
    # Lame sentence parser; good enough for these purposes.  The terminator
    # must be followed by a space or by the end of the text; accepting
    # end-of-text keeps the final sentence, which has no trailing space.
    sentences = [part.strip() for part in re.findall(r'.+?[?!.](?: |$)', s)]
    return [sentence for sentence in sentences if sentence]
def main():
    """Convert the HTML files under 'logs' into 'megahal.lrn'.

    Every <td class="message"> cell of every file in the 'logs' directory
    is passed through cleanup(); the sentences it returns are written to
    'megahal.lrn', one per line.
    """
    with open('megahal.lrn', 'w') as f:
        for name in os.listdir('logs'):
            # Read through a context manager so each log's file handle is
            # closed as soon as its contents have been loaded.
            with open(os.path.join('logs', name)) as log:
                markup = log.read()
            soup = BeautifulSoup(markup, convertEntities=BeautifulSoup.HTML_ENTITIES)
            for td in soup.findAll('td', {'class': 'message'}):
                try:
                    lines = cleanup(td.text)
                    if lines:
                        f.write('%s\n' % '\n'.join(lines))
                except UnicodeEncodeError:
                    # Best effort: skip any message that raises
                    # UnicodeEncodeError instead of aborting the whole run.
                    pass
|
Loading...