In [ ]:
def cleantag(rowText):
    """Return ``rowText`` with tags, spaces and ``_x000D_`` markers removed.

    Every substring matching ``<[^<]+>`` is deleted first.  From what is
    left, all space characters are dropped and then every literal
    ``_x000D_`` is dropped (in that order, so a marker that only forms once
    the spaces are gone is removed as well).

    NOTE: ``[^<]+`` is greedy and may run across ``>`` characters, so a single
    match extends to the last ``>`` found before the next ``<`` (or before
    the end of the text).
    """
    tag_pattern = re.compile('<[^<]+>')
    without_tags = tag_pattern.sub('', rowText)
    without_spaces = without_tags.replace(' ', '')
    return without_spaces.replace('_x000D_', '')


# Walk rootdir and, for every file whose path ends in ".tsv", write a
# companion "<file name>_combine.tsv" into outdir with three columns:
#   id       - input column 0, unchanged
#   original - input column 1 with the literal '_x000D_' markers removed
#   clean    - input column 1 after cleantag()
#
# NOTE: outputs are named after the bare file name only, so inputs sharing a
# name in different subdirectories overwrite each other in outdir.
# NOTE: generated names also end in ".tsv"; if outdir lies inside rootdir they
# can be picked up as inputs by this walk or by a later run.
fieldnames = ['id', 'original', 'clean']  # header and column order of every output

for subdir, dirs, files in os.walk(rootdir):
    for filename in files:
        currentFile = os.path.join(subdir, filename)
        if not currentFile.endswith(".tsv"):
            continue
        # Parenthesised so the statement parses on both Python 2 and Python 3.
        print(currentFile)
        outFile = os.path.join(outdir, filename)

        # Python 2's csv module expects its file objects in binary mode; with
        # text-mode 'w' the writer's "\r\n" terminator becomes "\r\r\n" on
        # Windows.  The output is opened first, as before, and each handle has
        # its own name so neither one shadows the other.
        with open(outFile + '_combine.tsv', 'wb') as outTsv, \
                open(currentFile, 'rb') as inTsv:
            writer = csv.DictWriter(outTsv, delimiter='\t', fieldnames=fieldnames)
            writer.writeheader()

            for row in csv.reader(inTsv, delimiter='\t'):
                # Blank lines come back as [] and short lines lack the text
                # column; skip them instead of dying on row[1] with a
                # half-written output file.
                if len(row) < 2:
                    continue
                rawText = row[1]
                noTags = cleantag(rawText)
                cleanOriginal = rawText.replace('_x000D_', '')
                writer.writerow(dict(zip(fieldnames, (row[0], cleanOriginal, noTags))))