# tagParser.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import pandas as pd
import re
# Element names whose enclosed text is collected into a column of the same
# name by extractTagElements().
tags = [
    "name",
    "gover",
    "duties",
    "geo",
    "building",
    "note",
]
# Values of the `type` attribute on <name> elements; each value gets its own
# "name_<type>" column.
types = [
    "person",
    "nob",
    "king",
]
def _tagPatterns(tag, attribute=None, value=None):
    """Return the regex source strings that capture the text inside *tag*.

    Without *attribute* a single pattern for the bare ``<tag>...</tag>`` form
    is returned. With *attribute* and *value* two patterns are returned: one
    for ``<tag attribute="value">`` and one for the same opening tag written
    with doubled quotes (``attribute=""value""``).
    NOTE(review): the doubled form presumably covers quotes escaped CSV-style
    in the input file -- confirm against the data.

    The captured text must be non-empty and cannot span a line break, because
    ``.`` does not match newlines without ``re.DOTALL``.
    """
    tag = re.escape(tag)
    closing = ">(.+?)</" + tag + ">"
    if attribute is None:
        return ["<" + tag + closing]
    opening = "<" + tag + " " + re.escape(attribute) + "="
    value = re.escape(value)
    return [
        opening + '"' + value + '"' + closing,
        opening + '""' + value + '""' + closing,
    ]


def _joinMatches(column, patterns):
    """Apply *patterns* to every cell of *column*; join the captures with ','.

    Matches are listed pattern by pattern (all hits of the first pattern, then
    all hits of the second, ...). Missing cells (NaN/None) yield "" instead of
    making ``re`` raise a TypeError.
    """
    compiled = [re.compile(pattern) for pattern in patterns]

    def parse(text):
        if pd.isnull(text):
            return ""
        matches = []
        for regex in compiled:
            matches.extend(regex.findall(text))
        return ",".join(matches)

    return column.apply(parse)


def extractTagElements(frame=None, tagNames=None, typeNames=None):
    """Extract the text of inline markup tags into new DataFrame columns.

    Reads the ``original`` column of *frame* and adds, in place:

    * one column per entry of *tagNames*, holding the text of every
      attribute-less ``<tag>...</tag>`` element in the cell;
    * one ``name_<type>`` column per entry of *typeNames*, holding the text
      of ``<name type="<type>">`` elements;
    * ``note_author`` for ``<note type2="author">`` elements;
    * ``note_marginal`` for ``<note typepos="marginal">`` elements.

    Multiple matches within one cell are joined with commas; cells without a
    match (or with a missing value) become the empty string.

    Args:
        frame: DataFrame with an ``original`` column. Defaults to the
            module-level ``df``.
        tagNames: Tag names to extract. Defaults to the module-level ``tags``.
        typeNames: ``type`` attribute values to extract for ``<name>``.
            Defaults to the module-level ``types``.

    Returns:
        None. *frame* is modified in place.
    """
    # Fall back to the module globals so the historical zero-argument call
    # keeps working unchanged.
    if frame is None:
        frame = df
    if tagNames is None:
        tagNames = tags
    if typeNames is None:
        typeNames = types

    source = frame["original"]

    for tagName in tagNames:
        frame[tagName] = _joinMatches(source, _tagPatterns(tagName))

    for typeName in typeNames:
        frame["name_" + typeName] = _joinMatches(
            source, _tagPatterns("name", "type", typeName)
        )

    frame["note_author"] = _joinMatches(
        source, _tagPatterns("note", "type2", "author")
    )
    frame["note_marginal"] = _joinMatches(
        source, _tagPatterns("note", "typepos", "marginal")
    )
# Script entry point: read the tab-separated file named by the first
# command-line argument, add the extracted tag columns, and write the result
# next to the input as "<input>.extracted.tsv".
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: " + sys.argv[0] + " INPUT.tsv")
    # `df` stays a module-level global: extractTagElements() reads it when
    # called without arguments.
    df = pd.read_csv(sys.argv[1], sep='\t')
    extractTagElements()
    df.to_csv(sys.argv[1] + ".extracted.tsv", sep='\t')