[go: nahoru, domu]

Skip to content
This repository has been archived by the owner on Apr 18, 2022. It is now read-only.

Commit

Permalink
Working on improving Tischendorf importer to use new models; still ne…
Browse files Browse the repository at this point in the history
…eding to generate id
  • Loading branch information
westonruter committed Jul 24, 2010
1 parent 77307e2 commit 54f8058
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 69 deletions.
76 changes: 38 additions & 38 deletions apps/importers/management/commands/load_tischendorf.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,16 +63,16 @@
from django.core.management.base import BaseCommand

from core import osis
from texts.management.import_helpers import abort_if_imported
from texts.management.import_helpers import close_structure
from texts.management.import_helpers import delete_work
from texts.management.import_helpers import download_resource
from importers.management.import_helpers import abort_if_imported
from importers.management.import_helpers import close_structure
from importers.management.import_helpers import delete_work
from importers.management.import_helpers import download_resource
from texts.models import Work, Token, Structure, WorkServer
from core.models import Language, License, Server


# TODO: Some of this might be better defined as SETTING
SOURCE_URL = "http://files.morphgnt.org/tischendorf/Tischendorf-2.5.zip"
SOURCE_URL = "http://files.morphgnt.org/tischendorf/Tischendorf-2.6.zip"

WORK1_ID = 1 # Tischendorf Kethiv
WORK1_VARIANT_BIT = 0b00000001
Expand Down Expand Up @@ -157,11 +157,11 @@ def create_works(self):
# Work for Kethiv edition (base text for qere)
work1 = Work(
id = WORK1_ID,
title = "Tischendorf 8th ed. v2.5",
title = "Tischendorf 8th ed. v2.6 Kethiv",
language = Language('grc'),
type = 'Bible',
osis_slug = 'Tischendorf',
publish_date = datetime.date(2009, 5, 29),
publish_date = datetime.date(2010, 7, 4),
import_date = datetime.datetime.now(),
variant_bit = WORK1_VARIANT_BIT,
creator = "<a href='http://en.wikipedia.org/wiki/Constantin_von_Tischendorf' title='Constantin von Tischendorf @ Wikipedia'>Constantin von Tischendorf</a>. Based on G. Clint Yale's Tischendorf text and on Dr. Maurice A. Robinson's Public Domain Westcott-Hort text. Edited by <a href='http://www.hum.aau.dk/~ulrikp/'>Ulrik Sandborg-Petersen</a>.",
Expand All @@ -177,11 +177,11 @@ def create_works(self):
# Work for Qere edition (Kethiv is base text)
work2 = Work(
id = WORK2_ID,
title = "Tischendorf 8th ed. v2.5 (Corrected)",
title = "Tischendorf 8th ed. v2.6 Qere (Corrected)",
language = Language('grc'),
type = 'Bible',
osis_slug = 'TischendorfCorrected',
publish_date = datetime.date(2009, 5, 29),
publish_date = datetime.date(2010, 7, 4),
import_date = datetime.datetime.now(),
variant_bit = WORK2_VARIANT_BIT,
variants_for_work = work1,
Expand Down Expand Up @@ -229,14 +229,14 @@ def handle(self, *args, **options):

# Set up the book ref
structs = {}
structs[Structure.BOOK] = Structure(
structs['book'] = Structure(
work = work1,
type = Structure.BOOK,
element = 'book',
osis_id = book_code,
position = structCount,
numerical_start = book_codes.index(book_code),
variant_bits = WORK2_VARIANT_BIT | WORK1_VARIANT_BIT,
source_url = "zip:" + SOURCE_URL + "!/Tischendorf-2.5/Unicode/" + BOOK_FILENAME_LOOKUP[book_code]
source_url = "zip:" + SOURCE_URL + "!/Tischendorf-2.6/Unicode/" + BOOK_FILENAME_LOOKUP[book_code]
#title = osis.BOOK_NAMES["Bible"][book_code]
)

Expand All @@ -246,7 +246,7 @@ def handle(self, *args, **options):
current_verse = None
lineNumber = -1

for line in StringIO.StringIO(_zip.read("Tischendorf-2.5/Unicode/" + BOOK_FILENAME_LOOKUP[book_code])):
for line in StringIO.StringIO(_zip.read("Tischendorf-2.6/Unicode/" + BOOK_FILENAME_LOOKUP[book_code])):
lineNumber += 1
lineMatches = LINE_PARSER.match(unicodedata.normalize("NFC", unicode(line, 'utf-8')))
if lineMatches is None:
Expand All @@ -261,42 +261,42 @@ def handle(self, *args, **options):
# New Chapter start
if lineMatches.group('chapter') != current_chapter:
# End the previous chapter
close_structure(Structure.CHAPTER, bookTokens, structs)
close_structure('chapter', bookTokens, structs)

# Start the next chapter
current_chapter = lineMatches.group('chapter')
structs[Structure.CHAPTER] = Structure(
structs['chapter'] = Structure(
work = work1, # remember work2 is subsumed by work1
type = Structure.CHAPTER,
element = 'chapter',
position = structCount,
osis_id = book_code + "." + current_chapter,
numerical_start = current_chapter,
variant_bits = WORK2_VARIANT_BIT | WORK1_VARIANT_BIT
)
print(structs[Structure.CHAPTER].osis_id)
print structs['chapter'].osis_id
structCount += 1

# New Verse start
if lineMatches.group('verse') != current_verse:
# End the previous verse
close_structure(Structure.VERSE, bookTokens, structs)
close_structure('verse', bookTokens, structs)

# Start the next verse
current_verse = lineMatches.group('verse')
structs[Structure.VERSE] = Structure(
structs['verse'] = Structure(
work = work1, # remember work2 is subsumed by work1
type = Structure.VERSE,
element = 'verse',
position = structCount,
osis_id = book_code + "." + current_chapter + "." + current_verse,
numerical_start = current_verse,
variant_bits = WORK2_VARIANT_BIT | WORK1_VARIANT_BIT
)
print(structs[Structure.VERSE].osis_id)
print structs['verse'].osis_id
structCount += 1

# End paragraph
paragraph_marker = None
if lineMatches.group('break') == 'P' and structs.has_key(Structure.PARAGRAPH):
if lineMatches.group('break') == 'P' and structs.has_key('p'):
assert(len(bookTokens) > 0)

paragraph_marker = Token(
Expand All @@ -308,22 +308,22 @@ def handle(self, *args, **options):
)
tokenCount += 1
paragraph_marker.save()
structs[Structure.PARAGRAPH].end_marker = paragraph_marker
close_structure(Structure.PARAGRAPH, bookTokens, structs)
structs['p'].end_marker = paragraph_marker
close_structure('p', bookTokens, structs)
bookTokens.append(paragraph_marker)

# Start paragraph
if len(bookTokens) == 0 or lineMatches.group('break') == 'P':
assert(not structs.has_key(Structure.PARAGRAPH))
assert(not structs.has_key('p'))
print("¶")
structs[Structure.PARAGRAPH] = Structure(
structs['p'] = Structure(
work = work1, # remember work2 is subsumed by work1
type = Structure.PARAGRAPH,
element = 'p',
position = structCount,
variant_bits = WORK2_VARIANT_BIT | WORK1_VARIANT_BIT
)
if paragraph_marker:
structs[Structure.PARAGRAPH].start_marker = paragraph_marker
structs['p'].start_marker = paragraph_marker
structCount += 1

# Insert whitespace
Expand Down Expand Up @@ -352,7 +352,7 @@ def handle(self, *args, **options):
# Open UNCERTAIN1 bracket
assert(lineMatches.group('kethivStartBracket') == lineMatches.group('qereStartBracket'))
if lineMatches.group('kethivStartBracket'):
assert(not structs.has_key(Structure.UNCERTAIN1))
assert(not structs.has_key('doubted'))
print("### OPEN BRACKET")

# Make start_marker for UNCERTAIN1
Expand All @@ -367,9 +367,9 @@ def handle(self, *args, **options):
lineTokens.append(open_bracket_token)

# Create the UNCERTAIN1 structure
structs[Structure.UNCERTAIN1] = Structure(
structs['doubted'] = Structure(
work = work1, # remember work2 is subsumed by work1
type = Structure.UNCERTAIN1,
element = 'doubted',
position = structCount,
variant_bits = WORK2_VARIANT_BIT | WORK1_VARIANT_BIT,
start_marker = open_bracket_token
Expand All @@ -395,7 +395,7 @@ def handle(self, *args, **options):

# Make this token the start of the UNCERTAIN structure
if lineMatches.group('kethivStartBracket'):
structs[Structure.UNCERTAIN1].start_token = token_work1
structs['doubted'].start_token = token_work1

# Qere token
if lineMatches.group('kethiv') != lineMatches.group('qere'):
Expand Down Expand Up @@ -433,10 +433,10 @@ def handle(self, *args, **options):
# Close UNCERTAIN1 bracket
assert(lineMatches.group('kethivEndBracket') == lineMatches.group('qereEndBracket'))
if lineMatches.group('kethivEndBracket'):
assert(structs.has_key(Structure.UNCERTAIN1))
assert(structs.has_key('doubted'))
print("### CLOSE BRACKET")

structs[Structure.UNCERTAIN1].end_token = lineTokens[-1]
structs['doubted'].end_token = lineTokens[-1]

# Make end_marker for UNCERTAIN1
close_bracket_token = Token(
Expand All @@ -450,8 +450,8 @@ def handle(self, *args, **options):
close_bracket_token.save()

# Close the UNCERTAIN1 structure
structs[Structure.UNCERTAIN1].end_marker = close_bracket_token
close_structure(Structure.UNCERTAIN1, bookTokens, structs)
structs['doubted'].end_marker = close_bracket_token
close_structure('doubted', bookTokens, structs)
lineTokens.append(open_bracket_token)

# Set the start_token for each structure that isn't set
Expand All @@ -462,8 +462,8 @@ def handle(self, *args, **options):
for token in lineTokens:
bookTokens.append(token)

for structType in structs.keys():
close_structure(structType, bookTokens, structs)
for structElement in structs.keys():
close_structure(structElement, bookTokens, structs)

print("structCount: %s" % str(structCount))
print("tokenCount: %s" % str(tokenCount))
14 changes: 7 additions & 7 deletions apps/importers/management/import_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ def get_book_code_args():
book_codes.append(arg)
return book_codes

def close_structure(type, bookTokens, structs):
if structs.has_key(type):
assert(structs[type].start_token is not None)
if structs[type].end_token is None:
structs[type].end_token = bookTokens[-1]
structs[type].save()
del structs[type]
def close_structure(element, bookTokens, structs):
    """Finalize, save, and forget the open structure stored under ``element``.

    Does nothing when ``structs`` has no entry for ``element``. Otherwise,
    if the structure's ``end_token`` has not been set it defaults to the last
    item of ``bookTokens``; the structure is then saved and its entry is
    removed from ``structs``.

    Args:
        element: Key identifying the open structure in ``structs`` (callers
            pass element names such as 'chapter', 'verse' or 'p').
        bookTokens: Sequence of tokens gathered so far; its last item is the
            fallback ``end_token``.
        structs: Dict of currently open structures keyed by element.
            Mutated in place: the closed entry is deleted.

    Raises:
        AssertionError: If the open structure has no ``start_token``.
        IndexError: If ``end_token`` is unset and ``bookTokens`` is empty.
    """
    # `in` replaces dict.has_key(), which is deprecated in Python 2 and
    # removed in Python 3; the membership test is otherwise equivalent.
    if element not in structs:
        return

    struct = structs[element]
    assert struct.start_token is not None
    if struct.end_token is None:
        # No explicit end was recorded, so the structure ends at the most
        # recently collected token.
        struct.end_token = bookTokens[-1]
    struct.save()
    del structs[element]

6 changes: 2 additions & 4 deletions apps/texts/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,6 @@ class Token(models.Model):
"""

id = models.CharField(_("base32 sha-256 hash of [workID, passage, n-gram token context]"), max_length=52, primary_key=True)

# TODO: id needs to be a CharField for storing the Base32 hash
data = models.CharField(_("Unicode data in Normalization Form C (NFC)"), max_length=255, db_index=True)

WORD = 1
Expand Down Expand Up @@ -409,7 +407,7 @@ class Structure(models.Model):
# Note: attributes is a related_name for StructureAttribute
# Question: what about using XMLField? Or storing attributes via GeoDjango.DictionaryField

#osis_id = models.CharField(max_length=32, blank=True, db_index=True) # moved to StructureAttribute
osis_id = models.CharField(max_length=32, blank=True, db_index=True) # the one attribute moved to StructureAttribute
work = models.ForeignKey(Work, help_text=_("Must be same as start/end_*_token.work. Must not be a variant work; use the variant_bits to select for it"))
variant_bits = models.PositiveSmallIntegerField(default=0b00000001, help_text=_("Bitwise anded with Work.variant_bit to determine if belongs to work."))

Expand Down Expand Up @@ -484,7 +482,7 @@ def tokens(self, include_outside_markers = False, variant_bits = None):
class Meta:
ordering = ['position'] #, 'variant_number'
unique_together = (
('type', 'position', 'start_token'), #???
('element', 'position', 'start_token'), #???
)

def __unicode__(self):
Expand Down
40 changes: 20 additions & 20 deletions apps/texts/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,11 @@ def passage(request, osis_ref):
# Get the desired hierarchy (serialization order) for Structures
structure_types = []
structure_types_always_milestoned = {}
STRUCTURE_TYPE_CODES = {}
#STRUCTURE_TYPE_CODES = {}
structure_type_hierarchy = []
for choice_tuple in Structure.TYPE_CHOICES: #TODO: There's gotta be a better way to do reverse lookup of choices tuples
STRUCTURE_TYPE_CODES[choice_tuple[1]] = choice_tuple[0]
structure_types.append(choice_tuple[0])
#for choice_tuple in Structure.TYPE_CHOICES: #TODO: There's gotta be a better way to do reverse lookup of choices tuples
# STRUCTURE_TYPE_CODES[choice_tuple[1]] = choice_tuple[0]
# structure_types.append(choice_tuple[0])


is_standoff = False
Expand All @@ -83,24 +83,24 @@ def passage(request, osis_ref):
# Predefined hierarchy: Book-Chapter-Verse
if request.GET["hierarchy"] == 'bcv':
structure_type_hierarchy = [
Structure.BOOK_GROUP,
Structure.BOOK,
Structure.CHAPTER,
Structure.VERSE,
'bookGroup',
'book',
'chapter',
'verse',
]
structure_types_always_milestoned[Structure.PARAGRAPH] = True
structure_types_always_milestoned['p'] = True

# Predefined hierarchy: Book-Section-Paragraph
elif request.GET["hierarchy"] == 'bsp':
structure_type_hierarchy = [
Structure.BOOK_GROUP,
Structure.BOOK,
Structure.SECTION,
Structure.PARAGRAPH,
Structure.LINE
'bookGroup',
'book',
'section',
'p',
'l'
]
structure_types_always_milestoned[Structure.VERSE] = True
structure_types_always_milestoned[Structure.CHAPTER] = True
structure_types_always_milestoned['verse'] = True
structure_types_always_milestoned['chapter'] = True

# Predefined hierarchy: milestone
elif request.GET["hierarchy"] == 'milestone':
Expand All @@ -113,11 +113,11 @@ def passage(request, osis_ref):
always_milestone = struct_type.startswith('~')
if always_milestone:
struct_type = struct_type[1:]
structure_types_always_milestoned[STRUCTURE_TYPE_CODES[struct_type]] = True
structure_types_always_milestoned[struct_type] = True

if not STRUCTURE_TYPE_CODES.has_key(struct_type):
return HttpResponseBadRequest("Unexpected structure type '%s' provided for hieararchy" % struct_type, mimetype = "text/plain")
structure_type_hierarchy.append(STRUCTURE_TYPE_CODES[struct_type])
#if not STRUCTURE_TYPE_CODES.has_key(struct_type):
# return HttpResponseBadRequest("Unexpected structure type '%s' provided for hieararchy" % struct_type, mimetype = "text/plain")
structure_type_hierarchy.append(struct_type)

# Append any remaining in the order they are defined
for struct_type in structure_types:
Expand Down
1 change: 1 addition & 0 deletions settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@

# project
"core",
"importers",
"texts",
# "morphs"
]
Expand Down

0 comments on commit 54f8058

Please sign in to comment.