Using spaCy - multi-processing and serialization
spaCy can process a stream of texts in parallel with nlp.pipe, as shown below.
for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
    pass
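For reference, here is a minimal, self-contained sketch of the same call. The model name en_core_web_sm and the toy texts are placeholders, and n_threads only has an effect in spaCy v2.0.x (later releases ignore it).

import spacy

nlp = spacy.load('en_core_web_sm')  # any model with the components you need

# toy input; in practice this would be a large list or generator of strings
texts = ["This is the first document.", "Here is another one."] * 5000

# nlp.pipe streams the texts and yields processed Doc objects batch by batch
token_counts = []
for doc in nlp.pipe(texts, batch_size=10000, n_threads=3):
    token_counts.append(len(doc))

print(sum(token_counts), "tokens processed")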
The code below uses Joblib together with spaCy to process texts across multiple processes. I don't fully understand the details. -_-
#!/usr/bin/env python
# coding: utf8
"""Example of multi-processing with Joblib. Here, we're exporting
part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with
each "sentence" on a newline, and spaces between tokens. Data is loaded from
the IMDB movie reviews dataset and will be loaded automatically via Thinc's
built-in dataset loader.
Compatible with: spaCy v2.0.0+
"""
from __future__ import print_function, unicode_literals
from toolz import partition_all
from pathlib import Path
from joblib import Parallel, delayed
import thinc.extra.datasets
import plac
import spacy


@plac.annotations(
    output_dir=("Output directory", "positional", None, Path),
    model=("Model name (needs tagger)", "positional", None, str),
    n_jobs=("Number of workers", "option", "n", int),
    batch_size=("Batch-size for each process", "option", "b", int),
    limit=("Limit of entries from the dataset", "option", "l", int))
def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
         limit=10000):
    nlp = spacy.load(model)  # load spaCy model
    print("Loaded model '%s'" % model)
    if not output_dir.exists():
        output_dir.mkdir()
    # load and pre-process the IMDB dataset
    print("Loading IMDB data...")
    data, _ = thinc.extra.datasets.imdb()
    texts, _ = zip(*data[-limit:])
    print("Processing texts...")
    # split the texts into batches of batch_size documents
    partitions = partition_all(batch_size, texts)
    # joblib executor that runs tasks in n_jobs worker processes
    executor = Parallel(n_jobs=n_jobs)
    do = delayed(transform_texts)
    # one delayed task per batch; each worker writes its own output file
    tasks = (do(nlp, i, batch, output_dir)
             for i, batch in enumerate(partitions))
    executor(tasks)


def transform_texts(nlp, batch_id, texts, output_dir):
    print(nlp.pipe_names)
    out_path = Path(output_dir) / ('%d.txt' % batch_id)
    if out_path.exists():  # return None in case same batch is called again
        return None
    print('Processing batch', batch_id)
    with out_path.open('w', encoding='utf8') as f:
        for doc in nlp.pipe(texts):
            f.write(' '.join(represent_word(w) for w in doc if not w.is_space))
            f.write('\n')
    print('Saved {} texts to {}.txt'.format(len(texts), batch_id))


def represent_word(word):
    text = word.text
    # True-case, i.e. try to normalize sentence-initial capitals.
    # Only do this if the lower-cased form is more probable.
    if text.istitle() and is_sent_begin(word) \
            and word.prob < word.doc.vocab[text.lower()].prob:
        text = text.lower()
    return text + '|' + word.tag_


def is_sent_begin(word):
    if word.i == 0:
        return True
    elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'):
        return True
    else:
        return False


if __name__ == '__main__':
    plac.call(main)
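The Joblib part is easier to follow in isolation: Parallel(n_jobs=...) takes an iterable of delayed(...) calls and farms each call out to a worker process. A minimal sketch, with square standing in for transform_texts above:

from joblib import Parallel, delayed

def square(x):
    # stand-in for transform_texts: any picklable function works
    return x * x

# Parallel consumes the generator of delayed calls, dispatches each call
# to one of n_jobs worker processes, and collects the return values
results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(10))
print(results)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]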
spaCy also supports serialization, so results and models can be saved to disk and loaded back later. The snippet below serializes a processed Doc (not the model itself).
# save a processed Doc to disk
# ('nlp' is a loaded spaCy model, e.g. spacy.load('en_core_web_sm'))
text = open('customer_feedback_627.txt', 'r').read()
doc = nlp(text)
doc.to_disk('/customer_feedback_627.bin')
# load the Doc back (a Doc needs a Vocab to deserialize into)
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab()).from_disk('/customer_feedback_627.bin')
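The same idea applies to the pipeline itself: nlp.to_disk() writes the whole model (vocab, tagger, parser, ...) to a directory, and spacy.load() reads it back. A rough sketch, assuming an installed en_core_web_sm model and a writable /tmp/my_model path:

import spacy
from pathlib import Path

nlp = spacy.load('en_core_web_sm')

# save the whole pipeline (vocab, tagger, parser, ner, ...) to a directory
model_dir = Path('/tmp/my_model')
nlp.to_disk(model_dir)

# load it back later, just like a packaged model
nlp2 = spacy.load(model_dir)
doc = nlp2("The round-trip preserves the trained components.")
print([(t.text, t.tag_) for t in doc])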