#!/usr/bin/env python3
"""Extract IPA pronunciation templates from a wiki-markup dump.

The input is read line by line from the file named by the first
command-line argument, or from standard input when no argument is given.
(Presumably the input is a Wiktionary XML export -- the script only relies
on ``<title>`` lines, ``===Pronunciation`` headings and ``{{IPA...}}`` /
``{{a|...}}`` templates.)

The result, printed as a Python ``dict`` literal, maps each page title to a
dictionary keyed by ``(language, accent)``.  Each value is the raw
``{{IPA...}}`` template text, or a ``set`` of such texts when several
templates share the same key.
"""
import re
import sys

# ``</title>`` is excluded from the captured title when it is present; a
# line that lacks the closing tag still yields the rest of the line.
_TITLE_RE = re.compile(r"<title>(.*?)(?:</title>|$)")
_IPA_RE = re.compile(r"(\{\{IPA.*?\}\})")
_ACCENT_RE = re.compile(r"\{\{a\|(.*?)\}\}")
_LANG_RE = re.compile(r"lang=([a-z]*)")
_PRONUNCIATION_HEADINGS = ("===Pronunciation", "====Pronunciation")

_DEFAULT_ACCENT = "NA"
_DEFAULT_LANG = "en"


def _record_ipa(entry, line):
    """Add the first ``{{IPA...}}`` template found in *line* to *entry*.

    The accent comes from an ``{{a|...}}`` template on the same line
    (``"NA"`` when absent) and the language from a ``lang=`` parameter inside
    the IPA template itself (``"en"`` when absent).  Lines without an IPA
    template leave *entry* untouched.
    """
    ipa_match = _IPA_RE.search(line)
    if ipa_match is None:
        return
    ipa = ipa_match.group(1)

    accent_match = _ACCENT_RE.search(line)
    accent = accent_match.group(1) if accent_match else _DEFAULT_ACCENT

    lang_match = _LANG_RE.search(ipa)
    lang = lang_match.group(1) if lang_match else _DEFAULT_LANG

    key = (lang, accent)
    previous = entry.get(key)
    if previous is None:
        entry[key] = ipa
    elif isinstance(previous, set):
        previous.add(ipa)
    else:
        # Second template for this key: promote the single string to a set.
        entry[key] = {previous, ipa}


def parse_pronunciations(lines):
    """Return ``{title: {(lang, accent): ipa_or_set_of_ipa}}`` for *lines*.

    *lines* is any iterable of text lines (an open file works).  Pages that
    contain no IPA template are omitted from the result.
    """
    pn_dict = {}
    definition = ""
    entry = {}
    in_pronunciation = False

    for line in lines:
        # Only indented lines are considered as title candidates, as before;
        # an indented line without ``<title>`` is now simply not a title
        # instead of crashing on a failed match.
        title_match = _TITLE_RE.search(line) if line.startswith(" ") else None
        if title_match is not None:
            if entry:
                pn_dict[definition] = entry
            entry = {}
            definition = title_match.group(1)
            # A section left open at the end of the previous page must not
            # leak into this one.
            in_pronunciation = False
            continue

        if line.startswith(_PRONUNCIATION_HEADINGS):
            in_pronunciation = True
        elif in_pronunciation and line.startswith("=="):
            # Any other heading closes the pronunciation section.
            in_pronunciation = False
        elif in_pronunciation:
            _record_ipa(entry, line)

    # The loop only stores an entry when the *next* title shows up, so the
    # final page has to be stored here.
    if entry:
        pn_dict[definition] = entry
    return pn_dict


def main(argv=None):
    """Parse the file named in ``argv[1]`` (or stdin) and print the result."""
    args = sys.argv if argv is None else argv
    if len(args) > 1:
        # IPA transcriptions are non-ASCII, so the platform default encoding
        # is not relied upon.  NOTE(review): assumes the dump is UTF-8.
        with open(args[1], encoding="utf-8") as in_file:
            pn_dict = parse_pronunciations(in_file)
    else:
        pn_dict = parse_pronunciations(sys.stdin)
    print(pn_dict)


if __name__ == "__main__":
    main()