# Compiled once at import time instead of on every call; cleantag() runs
# once per TSV row.  '[^<]+' also matches newlines, so a tag that spans
# several lines is still removed.
_TAG_RE = re.compile(r'<[^<]+>')


def cleantag(rowText):
    """Return rowText with markup tags, spaces and '_x000D_' markers removed.

    A "tag" is a '<', followed by one or more characters other than '<',
    followed by '>'.  The match is greedy: it runs from the '<' up to the
    last '>' found before the next '<' (or the end of the string), so text
    sitting between a tag and a later stray '>' is removed together with
    the tag.  A '<' with no following '>' is left in place.

    After the tags are stripped, every ' ' character and every literal
    '_x000D_' sequence is deleted from the result.
    """
    without_tags = _TAG_RE.sub('', rowText)
    return without_tags.replace(' ', '').replace('_x000D_', '')
# Walk rootdir recursively and, for every file whose name ends in ".tsv",
# write "<name>_combine.tsv" into outdir with three tab-separated columns:
#   id       - column 0 of the input row, unchanged
#   original - column 1 with the '_x000D_' markers removed
#   clean    - column 1 after cleantag()
# NOTE: the output path is built from the bare file name only, so two input
# files with the same name in different sub-directories write to the same
# output file (the later one overwrites the earlier one).
for subdir, dirs, files in os.walk(rootdir):
    for filename in files:
        currentFile = os.path.join(subdir, filename)
        outFile = os.path.join(outdir, filename)
        if not currentFile.endswith(".tsv"):
            continue
        # Single-argument parenthesised form prints the same text on Python 2.
        print(currentFile)
        # The Python 2 csv module requires binary mode for both reading and
        # writing; text mode produces '\r\r\n' row endings on Windows.
        with open(outFile + '_combine.tsv', 'wb') as outHandle:
            fieldnames = ['id', 'original', 'clean']
            writer = csv.DictWriter(outHandle, delimiter='\t', fieldnames=fieldnames)
            writer.writeheader()
            # Separate name for the input handle so the still-open output
            # handle is not rebound while it is in use.
            with open(currentFile, 'rb') as inHandle:
                pages = csv.reader(inHandle, delimiter='\t')
                for row in pages:
                    # Blank lines and rows without a second column carry no
                    # text to clean; skip them instead of raising IndexError
                    # and leaving a truncated output file behind.
                    if len(row) < 2:
                        continue
                    row1 = row[1]
                    noTags = cleantag(row1)
                    cleanOriginal = row1.replace('_x000D_', '')
                    writer.writerow({
                        fieldnames[0]: row[0],
                        fieldnames[1]: cleanOriginal,
                        fieldnames[2]: noTags,
                    })