import glob
import json
import os
import re
import sys
from tqdm import tqdm
def _extract_categories(article, pattern):
    """Strip category links from one article and record their names.

    Every match of ``pattern`` is removed from ``article['text']``, doubled
    newlines are then collapsed once, and the collected category names are
    stored under ``article['categories']``. The dict is modified in place
    and also returned.
    """
    text = article['text']
    # findall yields (href, name) tuples; name is '' when the link text
    # was '*' rather than '분류:<name>'.
    matches = pattern.findall(text)
    text = pattern.sub('', text)
    article['text'] = text.replace('\n\n', '\n')
    # A '*' link carries no explicit name, so the article's own title is
    # recorded as the category in that case.
    article['categories'] = [
        name if name else article['title'] for _href, name in matches
    ]
    return article


def main(argv):
    """Convert JSON-lines ``wiki_*`` files into JSON arrays with categories.

    Args:
        argv: Sequence whose first item is the source directory and whose
            second item is the destination directory.

    Every file matching ``wiki_*`` anywhere below the source directory is
    read as one JSON object per line. Category links are stripped from each
    object's ``text`` and listed under ``categories``; the resulting objects
    are written as a single JSON array to the same relative path below the
    destination directory.

    Raises:
        IndexError: If fewer than two arguments are supplied.
    """
    src_path = argv[0]
    dst_path = argv[1]
    # Compiled once up front: the pattern is the same for every line.
    pattern = re.compile('<a href=\\"([^>]+)\\">(?:\\*|분류:([^<]+))<\\/a>')
    file_list = list(glob.iglob('%s/**/wiki_*' % src_path, recursive=True))
    for filename in tqdm(file_list):
        relpath = os.path.relpath(filename, src_path)
        output = []
        # The context manager guarantees the input handle is closed even if
        # a line fails to parse.
        with open(filename, 'r', encoding='utf-8') as source:
            for line in source:
                if not line.strip():
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                output.append(_extract_categories(json.loads(line), pattern))
        output_path = os.path.join(dst_path, relpath)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as converted:
            json.dump(output, converted, ensure_ascii=False)
if __name__ == "__main__":
    # Script entry point: hand main() only the user-supplied arguments,
    # dropping the script name in sys.argv[0].
    cli_args = sys.argv[1:]
    main(cli_args)
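# Example WikiExtractor invocation (presumably the command used to produce
# the wiki_* input files for this script -- confirm before relying on it):
#   python D:\tech\wikiextractor\WikiExtractor.py kowiki-20170320-pages-articles-multistream.xml -l -ns 분류 --json -o output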