Fix travis and README conflicts
honnibal committed Oct 18, 2016
2 parents c6a06c0 + 818dc83 commit ae29b9b
Showing 95 changed files with 5,101 additions and 765 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -9,6 +9,7 @@ tmp/
.eggs
*.tgz
.sass-cache
.python-version

MANIFEST

@@ -36,6 +37,7 @@ data/en/strings

_build/
.env/
tmp/

# Byte-compiled / optimized / DLL files
__pycache__/
28 changes: 12 additions & 16 deletions .travis.yml
@@ -1,34 +1,30 @@
language: python

sudo: required
sudo: false
dist: trusty
group: edge

python:
- "2.7"
- "3.4"
- "3.5"

os:
- linux

env:
- VIA="compile"
- VIA="sdist"

install:
- "pip install -r requirements.txt"
- "pip install -e ."
- "mkdir -p corpora/en"
- "cd corpora/en"
- "wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz"
- "tar -xzf WordNet-3.0.tar.gz"
- "mv WordNet-3.0 wordnet"
- "cd ../../"
- "python bin/init_model.py en lang_data/ corpora/ data"
- "cp package.json data"
- "sputnik build data en_default.sputnik"
- "sputnik --name spacy install en_default.sputnik"
- "./travis.sh"

script:
- "pip install pytest"
- "python -m pytest spacy"

- if [[ "${VIA}" == "compile" ]]; then SPACY_DATA=models/en python -m pytest spacy; fi
- if [[ "${VIA}" == "pypi" ]]; then python -m pytest `python -c "import pathlib; import spacy; print(pathlib.Path(spacy.__file__).parent.resolve())"`; fi
- if [[ "${VIA}" == "sdist" ]]; then python -m pytest `python -c "import pathlib; import spacy; print(pathlib.Path(spacy.__file__).parent.resolve())"`; fi

notifications:
slack:
secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
email: false
22 changes: 22 additions & 0 deletions README.rst
@@ -192,6 +192,7 @@ OS X ships with Python and git preinstalled.

Windows
-------
<<<<<<< HEAD

Install a version of Visual Studio Express or higher that matches the version
that was used to compile your Python interpreter. For official distributions
@@ -211,6 +212,27 @@ Python install. Run:
Run tests
=========

=======

Install a version of Visual Studio Express or higher that matches the version
that was used to compile your Python interpreter. For official distributions
these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and VS 2015 (Python 3.5).

Workaround for obsolete system Python
=====================================

If you're stuck using a system with an old version of Python, and you don't
have root access, we've prepared a bootstrap script to help you compile a local
Python install. Run:

.. code:: bash

    curl https://raw.githubusercontent.com/spacy-io/gist/master/bootstrap_python_env.sh | bash && source .env/bin/activate

Run tests
=========

>>>>>>> v1.0.0-rc1
spaCy comes with an extensive test suite. First, find out where spaCy is
installed:

115 changes: 25 additions & 90 deletions bin/parser/train.py
@@ -17,6 +17,7 @@
from spacy.syntax.util import Config
from spacy.gold import read_json_file
from spacy.gold import GoldParse
from spacy.gold import merge_sents

from spacy.scorer import Scorer

@@ -63,96 +64,24 @@ def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
scorer.score(tokens, gold, verbose=verbose)


def _merge_sents(sents):
    m_deps = [[], [], [], [], [], []]
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), brackets in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets)
        i += len(ids)
    return [(m_deps, m_brackets)]


def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False, pseudoprojective=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    if pseudoprojective:
        # preprocess training data here before ArcEager.get_labels() is called
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width, projectivize=pseudoprojective)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
def train(Language, train_data, dev_data, model_dir, tagger_cfg, parser_cfg, entity_cfg,
n_iter=15, seed=0, gold_preproc=False, n_sents=0, corruption_level=0):
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
format_str = '{:d}\t{:d}\t{uas:.3f}\t{ents_f:.3f}\t{tags_acc:.3f}\t{token_acc:.3f}'
with Language.train(model_dir, train_data,
tagger_cfg, parser_cfg, entity_cfg) as trainer:
loss = 0
for raw_text, sents in gold_tuples:
if gold_preproc:
raw_text = None
else:
sents = _merge_sents(sents)
for annot_tuples, ctnt in sents:
if len(annot_tuples[1]) == 1:
continue
score_model(scorer, nlp, raw_text, annot_tuples,
verbose=verbose if itn >= 2 else False)
if raw_text is None:
words = add_noise(annot_tuples[1], corruption_level)
tokens = nlp.tokenizer.tokens_from_list(words)
else:
raw_text = add_noise(raw_text, corruption_level)
tokens = nlp.tokenizer(raw_text)
nlp.tagger(tokens)
gold = GoldParse(tokens, annot_tuples)
if not gold.is_projective:
raise Exception("Non-projective sentence in training: %s" % annot_tuples[1])
loss += nlp.parser.train(tokens, gold)
nlp.entity.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags)
random.shuffle(gold_tuples)
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc))
print('end training')
nlp.end_training(model_dir)
print('done')
for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=gold_preproc,
augment_data=None)):
for doc, gold in epoch:
trainer.update(doc, gold)
dev_scores = trainer.evaluate(dev_data, gold_preproc=gold_preproc)
print(format_str.format(itn, loss, **dev_scores.scores))


def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None, cand_preproc=None):
    nlp = Language(data_dir=model_dir)
    nlp = Language(path=model_dir)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string, pos: set([string])
    if beam_width is not None:
@@ -162,7 +91,7 @@ def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False
        if gold_preproc:
            raw_text = None
        else:
            sents = _merge_sents(sents)
            sents = merge_sents(sents)
        for annot_tuples, brackets in sents:
            if raw_text is None:
                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
@@ -219,15 +148,21 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
)
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
         debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
    parser_cfg = dict(locals())
    tagger_cfg = dict(locals())
    entity_cfg = dict(locals())

    lang = spacy.util.get_lang_class(language)

    parser_cfg['features'] = lang.Defaults.parser_features
    entity_cfg['features'] = lang.Defaults.entity_features

    if not eval_only:
        gold_train = list(read_json_file(train_loc))
        train(lang, gold_train, model_dir,
              feat_set='basic' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose, pseudoprojective=pseudoprojective)
        gold_dev = list(read_json_file(dev_loc))
        train(lang, gold_train, gold_dev, model_dir, tagger_cfg, parser_cfg, entity_cfg,
              n_sents=n_sents, gold_preproc=gold_preproc, corruption_level=corruption_level,
              n_iter=n_iter)
    if out_loc:
        write_parses(lang, dev_loc, model_dir, out_loc)
    scorer = evaluate(lang, list(read_json_file(dev_loc)),
63 changes: 63 additions & 0 deletions examples/training/train_ner.py
@@ -0,0 +1,63 @@
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse


def train_ner(nlp, train_data, entity_types):
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    ner.model.end_training()
    return ner


def main(model_dir=None):
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', parser=False, entity=False, vectors=False)

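    # Each training example is a (text, annotations) pair; the annotations are
    # (start_char, end_char, label) character offsets into the raw text, which
    # is the format GoldParse(doc, entities=...) expects here.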
    train_data = [
        (
            'Who is Shaka Khan?',
            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [(len('I like '), len('I like London'), 'LOC'),
             (len('I like London and '), len('I like London and Berlin'), 'LOC')]
        )
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

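    # Apply the tagger and the newly trained entity recognizer to a fresh doc.
    # word.ent_iob prints as an integer IOB code (3 = beginning of an entity,
    # 1 = inside an entity, 2 = outside), as in the expected output below.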
    doc = nlp.make_doc('Who is Shaka Khan?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        with (model_dir / 'config.json').open('wb') as file_:
            json.dump(ner.cfg, file_)
        ner.model.dump(str(model_dir / 'model'))


if __name__ == '__main__':
    main()
    # Who "" 2
    # is "" 2
    # Shaka "" PERSON 3
    # Khan "" PERSON 1
    # ? "" 2
75 changes: 75 additions & 0 deletions examples/training/train_parser.py
@@ -0,0 +1,75 @@
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import DependencyParser
from spacy.gold import GoldParse
from spacy.tokens import Doc


def train_parser(nlp, train_data, left_labels, right_labels):
    parser = DependencyParser(
        nlp.vocab,
        left_labels=left_labels,
        right_labels=right_labels)
    for itn in range(1000):
        random.shuffle(train_data)
        loss = 0
        for words, heads, deps in train_data:
            doc = Doc(nlp.vocab, words=words)
            gold = GoldParse(doc, heads=heads, deps=deps)
            loss += parser.update(doc, gold)
    parser.model.end_training()
    return parser


def main(model_dir=None):
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', tagger=False, parser=False, entity=False, vectors=False)

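    # Each example is (words, heads, deps): heads[i] is the index of token i's
    # syntactic head within the same sentence, with the root token (here 'trade'
    # and 'like') pointing to itself; deps holds the dependency labels.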
    train_data = [
        (
            ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
            [1, 1, 4, 4, 5, 1, 1],
            ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
        ),
        (
            ['I', 'like', 'London', 'and', 'Berlin', '.'],
            [1, 1, 1, 2, 2, 1],
            ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
        )
    ]
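    # Split the observed dependency labels by arc direction: a label goes into
    # left_labels when the dependent precedes its head (i < head), otherwise
    # into right_labels, since DependencyParser takes both sets at construction.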
    left_labels = set()
    right_labels = set()
    for _, heads, deps in train_data:
        for i, (head, dep) in enumerate(zip(heads, deps)):
            if i < head:
                left_labels.add(dep)
            elif i > head:
                right_labels.add(dep)
    parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))

    doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.'])
    parser(doc)
    for word in doc:
        print(word.text, word.dep_, word.head.text)

    if model_dir is not None:
        with (model_dir / 'config.json').open('wb') as file_:
            json.dump(parser.cfg, file_)
        parser.model.dump(str(model_dir / 'model'))


if __name__ == '__main__':
    main()
    # I nsubj like
    # like ROOT like
    # securities dobj like
    # . cc securities
