Fix travis and README conflicts
honnibal committed Oct 18, 2016
2 parents c6a06c0 + 818dc83 commit ae29b9b
Showing 95 changed files with 5,101 additions and 765 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -9,6 +9,7 @@ tmp/
.eggs
*.tgz
.sass-cache
.python-version

MANIFEST

@@ -36,6 +37,7 @@ data/en/strings

_build/
.env/
tmp/

# Byte-compiled / optimized / DLL files
__pycache__/
28 changes: 12 additions & 16 deletions .travis.yml
@@ -1,34 +1,30 @@
language: python

sudo: required
sudo: false
dist: trusty
group: edge

python:
- "2.7"
- "3.4"
- "3.5"

os:
- linux

env:
- VIA="compile"
- VIA="sdist"

install:
- "pip install -r requirements.txt"
- "pip install -e ."
- "mkdir -p corpora/en"
- "cd corpora/en"
- "wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz"
- "tar -xzf WordNet-3.0.tar.gz"
- "mv WordNet-3.0 wordnet"
- "cd ../../"
- "python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data"
- "sputnik build data en_default.sputnik"
- "sputnik --name spacy install en_default.sputnik"
- "./travis.sh"

script:
- "pip install pytest"
- "python -m pytest spacy"

- if [[ "${VIA}" == "compile" ]]; then SPACY_DATA=models/en python -m pytest spacy; fi
- if [[ "${VIA}" == "pypi" ]]; then python -m pytest `python -c "import pathlib; import spacy; print(pathlib.Path(spacy.__file__).parent.resolve())"`; fi
- if [[ "${VIA}" == "sdist" ]]; then python -m pytest `python -c "import pathlib; import spacy; print(pathlib.Path(spacy.__file__).parent.resolve())"`; fi

notifications:
slack:
secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
email: false
22 changes: 22 additions & 0 deletions README.rst
@@ -192,6 +192,7 @@ OS X ships with Python and git preinstalled.

Windows
-------
<<<<<<< HEAD

Install a version of Visual Studio Express or higher that matches the version
that was used to compile your Python interpreter. For official distributions
@@ -211,6 +212,27 @@ Python install. Run:
Run tests
=========

=======

Install a version of Visual Studio Express or higher that matches the version
that was used to compile your Python interpreter. For official distributions
these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and VS 2015 (Python 3.5).

Workaround for obsolete system Python
=====================================

If you're stuck using a system with an old version of Python, and you don't
have root access, we've prepared a bootstrap script to help you compile a local
Python install. Run:

.. code:: bash

    curl https://raw.githubusercontent.com/spacy-io/gist/master/bootstrap_python_env.sh | bash && source .env/bin/activate

Run tests
=========

>>>>>>> v1.0.0-rc1
spaCy comes with an extensive test suite. First, find out where spaCy is
installed:

115 changes: 25 additions & 90 deletions bin/parser/train.py
@@ -17,6 +17,7 @@
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.gold import merge_sents

from spacy.scorer import Scorer

@@ -63,96 +64,24 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
scorer.score(tokens, gold, verbose=verbose)


def _merge_sents(sents):
    m_deps = [[], [], [], [], [], []]
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), brackets in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
        i += len(ids)
    return [(m_deps, m_brackets)]


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False, pseudoprojective=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    if pseudoprojective:
        # preprocess training data here before ArcEager.get_labels() is called
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width, projectivize=pseudoprojective)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
with Language.train(model_dir, train_data,
tagger_cfg, parser_cfg, entity_cfg) as trainer:
loss = 0
for raw_text, sents in gold_tuples:
if gold_preproc:
raw_text = None
else:
sents = _merge_sents(sents)
for annot_tuples, ctnt in sents:
if len(annot_tuples[1]) == 1:
continue
score_model(scorer, nlp, raw_text, annot_tuples,
verbose=verbose if itn >= 2 else False)
if raw_text is None:
words = add_noise(annot_tuples[1], corruption_level)
tokens = nlp.tokenizer.tokens_from_list(words)
else:
raw_text = add_noise(raw_text, corruption_level)
tokens = nlp.tokenizer(raw_text)
nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples)
if not gold.is_projective:
raise Exception("Non-projective sentence in training: %s" % annot_tuples[1])
loss += nlp.parser.train(tokens, gold)
nlp.entity.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags)
random.shuffle(gold_tuples)
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc))
print('end training')
nlp.end_training(model_dir)
print('done')
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc,
augment_data=None)):
for doc, gold in epoch:
trainer.update(doc, gold)
dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
print(format_str.format(itn, loss, **dev_scores.scores))


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    nlp = Language(data_dir=model_dir)
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string, pos: set([string])
    if beam_width is not None:
@@ -162,7 +91,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
            sents = merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
@@ -219,15 +148,21 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
)
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)

    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(lang, gold_train, model_dir,
              feat_set='basic' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose, pseudoprojective=pseudoprojective)
        gold_dev = list(read_json_file(dev_loc))
        train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
              n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
              n_iter=n_iter)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    scorer = evaluate(lang, list(read_json_file(dev_loc)),
63 changes: 63 additions & 0 deletions examples/training/train_ner.py
@@ -0,0 +1,63 @@
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse


def train_ner(nlp, train_data, entity_types):
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    ner.model.end_training()
    return ner


def main(model_dir=None):
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', parser=False, entity=False, vectors=False)

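    # Each training example is a (text, annotations) pair; the annotations are
    # (start_char, end_char, label) character offsets into the raw text, which
    # is the format GoldParse(doc, entities=...) expects here.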
    train_data = [
        (
            'Who is Shaka Khan?',
            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [(len('I like '), len('I like London'), 'LOC'),
             (len('I like London and '), len('I like London and Berlin'), 'LOC')]
        )
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

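    # Apply the tagger and the newly trained entity recognizer to a fresh doc.
    # word.ent_iob prints as an integer IOB code (3 = beginning of an entity,
    # 1 = inside an entity, 2 = outside), as in the expected output below.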
    doc = nlp.make_doc('Who is Shaka Khan?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        with (model_dir / 'config.json').open('wb') as file_:
            json.dump(ner.cfg, file_)
        ner.model.dump(str(model_dir / 'model'))


if __name__ == '__main__':
    main()
    # Who "" 2
    # is "" 2
    # Shaka "" PERSON 3
    # Khan "" PERSON 1
    # ? "" 2
75 changes: 75 additions & 0 deletions examples/training/train_parser.py
@@ -0,0 +1,75 @@
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import DependencyParser
from spacy.gold import GoldParse
from spacy.tokens import Doc


def train_parser(nlp, train_data, left_labels, right_labels):
    parser = DependencyParser(
        nlp.vocab,
        left_labels=left_labels,
        right_labels=right_labels)
    for itn in range(1000):
        random.shuffle(train_data)
        loss = 0
        for words, heads, deps in train_data:
            doc = Doc(nlp.vocab, words=words)
            gold = GoldParse(doc, heads=heads, deps=deps)
            loss += parser.update(doc, gold)
    parser.model.end_training()
    return parser


def main(model_dir=None):
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', tagger=False, parser=False, entity=False, vectors=False)

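    # Each example is (words, heads, deps): heads[i] is the index of token i's
    # syntactic head within the same sentence, with the root token (here 'trade'
    # and 'like') pointing to itself; deps holds the dependency labels.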
    train_data = [
        (
            ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
            [1, 1, 4, 4, 5, 1, 1],
            ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
        ),
        (
            ['I', 'like', 'London', 'and', 'Berlin', '.'],
            [1, 1, 1, 2, 2, 1],
            ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
        )
    ]
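    # Split the observed dependency labels by arc direction: a label goes into
    # left_labels when the dependent precedes its head (i < head), otherwise
    # into right_labels, since DependencyParser takes both sets at construction.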
    left_labels = set()
    right_labels = set()
    for _, heads, deps in train_data:
        for i, (head, dep) in enumerate(zip(heads, deps)):
            if i < head:
                left_labels.add(dep)
            elif i > head:
                right_labels.add(dep)
    parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))

    doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.'])
    parser(doc)
    for word in doc:
        print(word.text, word.dep_, word.head.text)

    if model_dir is not None:
        with (model_dir / 'config.json').open('wb') as file_:
            json.dump(parser.cfg, file_)
        parser.model.dump(str(model_dir / 'model'))


if __name__ == '__main__':
    main()
    # I nsubj like
    # like ROOT like
    # securities dobj like
    # . cc securities
