import regex as re import html import os from pkg_resources import resource_string from enum import Enum, auto from .name_parser_api.parsedname import ParsedName, State, NamePart from .name_parser_api.util import rankutils from .name_parser_api.rank import Rank, NomCode from .name_parser_api.authorship import Authorship def stripToNone(s): """ strips and return None if string is empty """ if s is None: return s = s.strip() if s == '': return None return s class UnparsableNameException(Exception): def __init__(self, type_, name): super().__init__("Unparsable {} name: {}".format(type_, name)) self.type = type_ self.name = name class NameType(Enum): """ A short classification of scientific name strings used in Checklist Bank. """ # A scientific latin name that might contain authorship but is not any of the other name types below (virus, hybrid, cultivar, etc). SCIENTIFIC = auto() # a virus name VIRUS = auto() # A hybrid formula (not a hybrid name). HYBRID_FORMULA = auto() """ A variation of a scientific name that either adds additional notes or has some shortcomings to be classified as regular scientific names. Frequent reasons are: - informal addition like "cf." - indetermined like "Abies spec." - abbreviated genus "A. alba Mill """ INFORMAL = auto() """ Operational Taxonomic Unit. An OTU is a pragmatic definition to group individuals by similarity, equivalent to but not necessarily in line with classical Linnaean taxonomy or modern Evolutionary taxonomy. A OTU usually refers to clusters of organisms, grouped by DNA sequence similarity of a specific taxonomic marker gene. In other words, OTUs are pragmatic proxies for "species" at different taxonomic levels. Sequences can be clustered according to their similarity to one another, and operational taxonomic units are defined based on the similarity threshold (usually 97% similarity) set by the researcher. Typically, OTU's are based on similar 16S rRNA sequences. """ OTU = auto() # A placeholder name like "incertae sedis" or "unknown genus". PLACEHOLDER = auto() # Surely not a scientific name of any kind. NO_NAME = auto() def isParsable(self): return self == self.SCIENTIFIC or self == self.INFORMAL class Warnings: NULL_EPITHET = "epithet with literal value null" UNUSUAL_CHARACTERS = "unusual characters" SUBSPECIES_ASSIGNED = "Name was considered species but contains infraspecific epithet" LC_MONOMIAL = "lower case monomial match" INDETERMINED = "indetermined name missing its terminal epithet" HIGHER_RANK_BINOMIAL = "binomial with rank higher than species aggregate" QUESTION_MARKS_REMOVED = "question marks removed" REPL_ENCLOSING_QUOTE = "removed enclosing quotes" MISSING_GENUS = "epithet without genus" RANK_MISMATCH = "rank does not fit the parsed name" HTML_ENTITIES = "html entities unescaped" XML_TAGS = "xml tags removed" class TaxonParser: """ Core parser class of the name parser that tries to take a clean name into its pieces by using regular expressions. Fully parse the supplied name also trying to extract authorships, a conceptual sec reference, remarks or notes on the nomenclatural status. In some cases the authorship parsing proves impossible and this nameparser will return null. For strings which are no scientific names and scientific names that cannot be expressed by the ParsedName class the parser will throw an UnparsableException with a given NameType and the original, unparsed name. This is the case for all virus names and proper hybrid formulas, so make sure you catch and process this exception. """ class Latin_endings_pattern: def __init__(self): endings = resource_string(__name__, "resources/latin-endings.txt").decode().split("\n") self.pattern = re.compile("(" + "|".join(endings) + ")$") AUTHORTEAM_DELIMITER = re.compile("[,&]") AUTHOR_INITIAL_SWAP = re.compile("^([^,]+) *, *([^,]+)$") NORM_EX_HORT = re.compile(r"\b(?:hort(?:usa?)?|cv)[. ]ex ", re.I) # name parsing NAME_LETTERS = "A-ZÏËÖÜÄÉÈČÁÀÆŒ" name_letters = "a-zïëöüäåéèčáàæœ" AUTHOR_LETTERS = NAME_LETTERS + r"\p{Lu}" # upper case unicode letter, not numerical # (\W is alphanum) author_letters = name_letters + r"\p{Ll}-?" # lower case unicode letter, not numerical # common 3 char or longer name suffices AUTHOR_TOKEN_3 = "fil|filius|hort|jun|junior|sen|senior" # common name suffices (ms=manuscript, not yet published) AUTHOR_TOKEN = (r"(?:\p{Lu}[\p{Lu}\p{Ll}'-]*" + "|" + AUTHOR_TOKEN_3 + "|al|f|j|jr|ms|sr|v|v[ao]n|bis|d[aeiou]?|de[nrmls]?|degli|e|l[ae]s?|s|ter|'?t|y" + r")\.?") AUTHOR = AUTHOR_TOKEN + "(?:[ '-]?" + AUTHOR_TOKEN + ")*" AUTHOR_TEAM = AUTHOR + "(?:[&,;]+" + AUTHOR + ")*" AUTHORSHIP = ( "(?:(" + AUTHOR_TEAM + r") ?\bex[. ])?" + "(" + AUTHOR_TEAM + ")" + r"(?: *: *(Pers\.?|Fr\.?))?" # 2 well known sanction authors for fungus, see POR-2454 ) AUTHOR_TEAM_PATTERN = re.compile("^" + AUTHOR_TEAM + "$") YEAR = "[12][0-9][0-9][0-9?]" YEAR_LOOSE = YEAR + r"[abcdh?]?(?:[/,-][0-9]{1,4})?" NOTHO = "notho" RANK_MARKER = ("(?:" + NOTHO + ")?(?:(?") STARTING_EPITHET = re.compile(r"^\s*(" + EPHITHET + r")\b") FORM_SPECIALIS = re.compile(r"\bf\. *sp(?:ec)?\b") SENSU_LATU = re.compile(r"\bs\.l\.\b") # many names still use outdated xxxtype rank marker, e.g. serotype instead of serovar TYPE_TO_VAR = re.compile(r"\b(" + "|".join(r.name.lower()[:-2] for r in rankutils.INFRASUBSPECIFIC_MICROBIAL_RANKS) \ + r")type\b") POTENTIAL_NAME_PATTERN = re.compile("^×?" + MONOMIAL + r"\b") REMOVE_INTER_RANKS = re.compile(r"\b((?:subsp|ssp|var)[ .].+)\b(" + RANK_MARKER + r")\b") # allow only short lower case tokens to avoid matching to a real epithet SKIP_AUTHORS = r"(?:\b[ \p{Ll}'(-]{0,3}\p{Lu}.*?\b)??"; NAME_PATTERN = re.compile( "^" # #1 genus/monomial + r"(×?(?:\?|" + MONOMIAL + "))" # #2 or #4 subgenus/section with #3 infrageneric rank marker + "(?:(? 0: self.pn.type = NameType.PLACEHOLDER name, n = self.REMOVE_PLACEHOLDER_INFRAGENERIC.subn("", name, 1) if n > 0: self.pn.type = NameType.PLACEHOLDER # resolve parsable names with a placeholder genus only name, n = self.PLACEHOLDER_GENUS.subn("? ", name, 1) if n: self.pn.type = NameType.PLACEHOLDER # detect further unparsable names if self.PLACEHOLDER.search(name): self.unparsable(NameType.PLACEHOLDER) if self.IS_VIRUS_PATTERN.search(name) or \ self.IS_VIRUS_PATTERN_CASE_SENSITIVE.search(name): self.unparsable(NameType.VIRUS) # detect RNA/DNA gene/strain names and flag as informal if self.IS_GENE.search(name): self.pn.type = NameType.INFORMAL # normalise name name = self.normalize(name) if name is None or name == '': self.unparsable(NameType.NO_NAME) # remove family in front of subfamily ranks name = self.FAMILY_PREFIX.sub(r"\1", name, 1) # check for supraspecific ranks at the beginning of the name m = self.SUPRA_RANK_PREFIX.search(name) if m: self.pn.rank = rankutils.RANK_MARKER_MAP.get(m.group(1).replace(".", "")) name = m.re.sub("", name, 1) # parse cultivar names first BEFORE we strongly normalize # this will potentially remove quotes needed to find cultivar names # this will potentially remove quotes needed to find cultivar group names m = self.CULTIVAR_GROUP.search(name) if m: self.pn.cultivarEpithet = m.group(1) name = m.re.sub(" ", name, 1) cgroup = m.group(2) if cgroup.lower() == "grex" or cgroup.lower() == "gx": self.pn.rank = Rank.GREX else: self.pn.rank = Rank.CULTIVAR_GROUP m = self.CULTIVAR.search(name) if m: self.pn.cultivarEpithet = m.group(2) name = m.re.sub(r"\1", name, 1) self.pn.rank = Rank.CULTIVAR # name without any latin char letter at all? if self.NO_LETTERS.search(name): self.unparsable(NameType.NO_NAME) if self.HYBRID_FORMULA_PATTERN.search(name): self.unparsable(NameType.HYBRID_FORMULA) name, n = self.IS_CANDIDATUS_PATTERN.subn("", name, 1) if n > 0: self.pn.candidatus = True # extract nom.illeg. and other nomen status notes notes = '' for m in self.EXTRACT_NOMSTATUS.finditer(name): if len(notes) > 0: notes += " " note = m.group(1).strip() notes += note # if there was a rank given in the nom status populate the rank marker field rm = self.NOV_RANK_MARKER.search(note) if rm: self.setRank(rm.group(1), True) name = self.EXTRACT_NOMSTATUS.sub("", name) # check if this works as intended self.pn.nomenclaturalNotes = stripToNone(notes) # manuscript names (unpublished names) # http://splink.cria.org.br/docs/appendix_j.pdf m = self.MANUSCRIPT_NAMES.search(name) if m: self.pn.type = NameType.INFORMAL self.pn.addRemark(m.group(0)) self.setRank(m.group(1).replace("indet", "sp")) name = m.re.sub("", name, 1) name, n = self.MANUSCRIPT_SUFFIX.subn("", name, 1) if n: self.pn.type = NameType.INFORMAL # parse out species/strain names with numbers found in Genebank/EBI names, e.g. Advenella kashmirensis W13003 m = self.STRAIN.search(name) if m: name = m.re.sub(m.group(1), name, 1) self.pn.type = NameType.INFORMAL self.pn.strain = m.group(2) # extract sec reference m = self.EXTRACT_SENSU.search(name) if m: self.pn.taxonomicNote = self.normNote(m.group(1)) name = m.re.sub("", name, 1) # extract other remarks m = self.EXTRACT_REMARKS.search(name) if m: self.pn.remarks = stripToNone(m.group(1)) name = m.re.sub("", name, 1) # check for indets m = self.RANK_MARKER_AT_END.search(name) # f. is a marker for forms, but more often also found in authorships as "filius" - son of. # so ignore those if m and not self.FILIUS_AT_END.search(name): # use as rank unless we already have a cultivar self.ignoreAuthorship = True if self.pn.cultivarEpithet is None: self.setRank(m.group(2)) name = m.re.sub("", name) # remove informal identification notes m = self.REPL_AFF.search(name) if m: self.pn.type = NameType.INFORMAL self.pn.addRemark(m.group(0)) name = m.re.sub("", name) # replace bibliographic in references m = self.REPL_IN_REF.search(name) if m: self.pn.addRemark(self.normNote(m.group(0))) name = m.re.sub("", name, 1) # remove superflous epithets with rank markers m = self.REMOVE_INTER_RANKS.search(name) if m: self.pn.addWarning("Intermediate classification removed: " + m.group(1)) name = m.re.sub(r"\2", name, 1) # remember current rank for later reuse preparsingRank = self.pn.rank nameStrongly = self.normalizeStrong(name) if nameStrongly is None or nameStrongly == '': # we might have parsed out remarks already which we treat as a placeholder if self.pn.hasName(): # stop here! self.pn.state = State.COMPLETE self.pn.type = NameType.PLACEHOLDER return else: self.unparsable(NameType.NO_NAME) # try regular parsing parsed = self.parseNormalisedName(nameStrongly) if not parsed: # try to spot a virus name once we know its not a scientific name m = self.IS_VIRUS_PATTERN_POSTFAIL.search(nameStrongly) if m: self.unparsable(NameType.VIRUS) # cant parse it, fail! # Does it appear to be a genuine name starting with a monomial? if self.POTENTIAL_NAME_PATTERN.search(name): self.unparsable(NameType.SCIENTIFIC) else: self.unparsable(NameType.NO_NAME) # did we parse a infraspecic manuscript name? if infraspecEpithet is not None: self.pn.infraspecificEpithet = infraspecEpithet # if we established a rank during preparsing make sure we use this not the parsed one if preparsingRank is not None and preparsingRank != Rank.UNRANKED and self.rank != preparsingRank: self.pn.rank = preparsingRank # determine name type self.determineNameType(name) # flag names that match doubtful patterns self.applyDoubtfulFlag(self.scientificName) # determine rank if not yet assigned if self.pn.rank.otherOrUnranked(): self.pn.rank = rankutils.inferRankFromParsedName(self.pn) # determine code if not yet assigned self.determineCode() def normNote(self, note): if note.startswith("(") and note.endswith(")"): note = note[1:-1] note = note.strip() # punctuation followed by a space, dots are special because of author initials note = re.sub( "([,;)])(?!= )", r"\1 ", note) # opening brackets with space note = re.sub("(? 0: self.pn.addWarning(Warnings.REPL_ENCLOSING_QUOTE) # no question marks after letters (after years they should remain) name, n = self.NO_Q_MARKS.subn(r"\1", name) if n > 0: self.pn.doubtful = True self.pn.addWarning(Warnings.QUESTION_MARKS_REMOVED) # remove prefixes name = self.REPL_RANK_PREFIXES.sub("", name) # remove brackets inside the genus, the kind taxon finder produces name = self.NORM_TF_GENUS.sub(r"\1\2 ", name) # TODO: replace square brackets, keeping content (or better remove all within?) #name = NORM_NO_SQUARE_BRACKETS.matcher(name).replaceAll(" $1 ") # replace different kind of brackets with () name = self.NORM_BRACKETS_OPEN_STRONG.sub("(", name) name = self.NORM_BRACKETS_CLOSE_STRONG.sub(")", name) # add ? genus when name starts with an epithet name, n = self.STARTING_EPITHET.subn(r"? \1", name, 1) if n > 0: self.pn.addWarning(Warnings.MISSING_GENUS) # add parenthesis around subgenus if missing m = self.NORM_SUBGENUS.search(name) if m: # make sure epithet is not a rank mismatch if self.parseRank(m.group(3)) is None: name = m.re.sub(r"\1(\2)\3", name) # finally NORMALIZE PUNCTUATION AND WHITESPACE again name = self.NORM_PUNCTUATIONS.sub(r"\1", name) name = self.NORM_WHITESPACE.sub(" ", name) return name.strip() def preClean(self, name): """ basic careful cleaning, trying to preserve all parsable name parts """ # remove bad whitespace in html entities name = self.XML_ENTITY_STRIP.sub(r"&\1", name) # unescape html entities length = len(name) name = html.unescape(name) if length > len(name): self.pn.addWarning(Warnings.HTML_ENTITIES) # finally remove still existing bad ampersands missing the closing ; name, n = self.AMPERSAND_ENTITY.subn("&", name) if n > 0: self.pn.addWarning(Warnings.HTML_ENTITIES) # replace xml tags name, n = self.XML_TAGS.subn("", name) if n > 0: self.pn.addWarning(Warnings.XML_TAGS) # trim name = name.strip() # remove quotes in beginning and matching ones at the end for c in self.QUOTES: idx = 0 while idx < len(name) and (c == name[idx] or name[idx].isspace()): idx += 1 if idx > 0: # check if we also find this char at the end end = 0 while c == name[len(name) - 1 - end] and (len(name) - idx - end) > 0: end += 1 name = name[idx:len(name) - end + 1] name = self.NORM_WHITESPACE.sub(" ", name) # replace various single quote apostrophes with always ' name = self.NORM_APOSTROPHES.sub("'", name) return name.strip() def setTypeIfNull(self, pn, type_): if pn.type is None: pn.type = type_ def determineNameType(self, normedName): """ Identifies a name type, defaulting to SCIENTIFIC_NAME so that type is never null """ # all rules below do not apply to unparsable names if self.pn.type is None or self.pn.type.isParsable(): # if we only match a monomial in the 3rd pass its suspicious if self.pn.uninomial is not None and normedName[0].islower(): self.pn.addWarning(Warnings.LC_MONOMIAL) self.pn.doubtful = True self.setTypeIfNull(self.pn, NameType.INFORMAL) elif self.pn.rank.notOtherOrUnranked(): if self.pn.isIndetermined(): self.pn.type = NameType.INFORMAL self.pn.addWarning(Warnings.INDETERMINED) elif self.pn.rank.isSupraspecific() and (self.pn.specificEpithet is not None or self.pn.infraspecificEpithet is not None): self.pn.addWarning(Warnings.RANK_MISMATCH) self.pn.doubtful = True self.pn.type = NameType.INFORMAL elif not self.pn.rank.isSpeciesOrBelow() and self.pn.isBinomial(): self.pn.addWarning(Warnings.HIGHER_RANK_BINOMIAL) self.pn.doubtful = True if self.pn.type is None: # an abbreviated name? if self.pn.isAbbreviated() or self.pn.isIncomplete(): self.pn.type = NameType.INFORMAL elif "?" == self.pn.uninomial or "?" == self.pn.genus or "?" == self.pn.specificEpithet: # a placeholder epithet only self.pn.type = NameType.PLACEHOLDER else: # anything else looks fine! self.pn.type = NameType.SCIENTIFIC def applyDoubtfulFlag(self, scientificName): # all rules below do not apply to unparsable names m = self.DOUBTFUL.search(scientificName) if not m: self.pn.doubtful = True self.pn.addWarning(Warnings.UNUSUAL_CHARACTERS) elif self.pn.type.isParsable(): m = self.DOUBTFUL_NULL.search(scientificName) if m: self.pn.doubtful = True self.pn.addWarning(Warnings.NULL_EPITHET) def determineCode(self): if self.pn.code is None: # does the rank tell us sth? if self.pn.rank.isRestrictedToCode() is not None: self.pn.code = self.pn.rank.isRestrictedToCode() elif self.pn.cultivarEpithet is not None: self.pn.code = NomCode.CULTIVARS elif self.pn.sanctioningAuthor is not None: # sanctioning is only for Fungi self.pn.code = NomCode.BOTANICAL elif self.pn.type == NameType.VIRUS: self.pn.code = NomCode.VIRUS elif self.pn.isCandidatus() or self.pn.strain is not None: self.pn.code = NomCode.BACTERIAL # elif self.pn.basionymAuthorship.year is not None or self.pn.combinationAuthorship.year is not None: # if years are given its a zoological name # self.pn.code = NomCode.ZOOLOGICAL elif not self.pn.basionymAuthorship.isEmpty(): if self.pn.combinationAuthorship.isEmpty(): # if only the basionym authorship is given its a zoological name! self.pn.code = NomCode.ZOOLOGICAL else: # if both the basionym and combination authorship is given its a botanical name! self.pn.code = NomCode.BOTANICAL elif self.pn.nomenclaturalNotes is not None and "illeg" in self.pn.nomenclaturalNotes: self.pn.code = NomCode.BOTANICAL def parseNormalisedName(self, name): """ Tries to parse a name string with the full regular expression. In very few, extreme cases names with very long authorships might cause the regex to never finish or take hours we should run this parsing in a separate thread that can be stopped if it runs too long. :return true if the name could be parsed, false in case of failure """ matcher = self.NAME_PATTERN.search(name) if matcher: if matcher.group(21) == '' or matcher.group(21) is None: self.pn.state = State.COMPLETE else: self.pn.state = State.PARTIAL self.pn.unparsed = stripToNone(matcher.group(21)) # the match can be the genus part of an infrageneric, bi- or trinomial, the uninomial or even the infrageneric epithet! self.setUninomialOrGenus(matcher, self.pn) bracketSubrankFound = False if matcher.group(2) is not None: bracketSubrankFound = True self.pn.infragenericEpithet = stripToNone(matcher.group(2)) elif matcher.group(4) is not None: self.setRank(matcher.group(3)) self.pn.infragenericEpithet = stripToNone(matcher.group(4)) self.pn.specificEpithet = stripToNone(matcher.group(5)) if matcher.group(6) is not None and len(matcher.group(6)) > 1 and "null" not in matcher.group(6): # 4 parted name, so its below subspecies self.pn.rank = Rank.INFRASUBSPECIFIC_NAME if matcher.group(7) is not None and matcher.group(7) != "": self.setRank(matcher.group(7)) self.pn.infraspecificEpithet = stripToNone(matcher.group(8)) # microbial ranks if matcher.group(9) is not None: self.setRank(matcher.group(9)) self.pn.infraspecificEpithet = matcher.group(10) # #11 indet rank markers if matcher.group(11) is not None: self.setRank(matcher.group(11)) self.ignoreAuthorship = True # make sure (infra)specific epithet is not a rank marker! self.lookForIrregularRankMarker() if self.pn.isIndetermined(): self.ignoreAuthorship = True # #12 is entire authorship, not stored in ParsedName if not self.ignoreAuthorship and stripToNone(matcher.group(12)) is not None: # #17/18/19/20 authorship (ex/auth/sanct/year) self.pn.combinationAuthorship = self.parseAuthorship(matcher.group(17), matcher.group(18), matcher.group(20)) # sanctioning author if matcher.group(19) is not None: self.pn.sanctioningAuthor = matcher.group(19) # #13/14/15/16 basionym authorship (ex/auth/sanct/year) self.pn.basionymAuthorship = self.parseAuthorship(matcher.group(13), matcher.group(14), matcher.group(16)) if bracketSubrankFound and self.infragenericIsAuthor(self.pn): # rather an author than a infrageneric rank. Swap in case of monomials self.pn.basionymAuthorship = self.parseAuthorship(None, self.pn.infragenericEpithet, None) self.pn.infragenericEpithet = None # check if we need to move genus to uninomial if self.pn.genus is not None and self.pn.specificEpithet is None and self.pn.infraspecificEpithet is None: self.pn.uninomial = self.pn.genus self.pn.genus = None return True return False @staticmethod def cleanYear(matchedYear): if matchedYear is not None and len(matchedYear) > 2: return matchedYear.strip() def setRank(self, rankMarker, force=False): """ Sets the parsed names rank based on a found rank marker Potentially also sets the notho field in case the rank marker indicates a hybrid if the rankMarker cannot be interpreted or is null nothing will be done. """ rank = self.parseRank(rankMarker) if rank is not None and rank.notOtherOrUnranked(): if force: self.pn.rank = rank else: self.setRankIfNotContradicting(rank) if rankMarker.startswith(self.NOTHO): if rank.isInfraspecific(): self.pn.notho = NamePart.INFRASPECIFIC elif rank == Rank.SPECIES: self.pn.notho = NamePart.SPECIFIC elif rank.isInfrageneric(): self.pn.notho = NamePart.INFRAGENERIC elif rank == Rank.GENUS: self.pn.notho = NamePart.GENERIC def setRankIfNotContradicting(self, rank): """ Sets the rank if the current rank of the parsed name is not contradicting to the given one. Mostly this is the case to define a Unranked rank. """ if self.pn.rank.isUncomparable(): if (self.pn.rank == Rank.INFRAGENERIC_NAME and not rank.isInfragenericStrictly()) \ or (self.pn.rank == Rank.INFRASPECIFIC_NAME and not rank.isInfraspecific()) \ or (self.pn.rank == Rank.INFRASUBSPECIFIC_NAME and not rank.isInfrasubspecific()): return self.pn.rank = rank @staticmethod def parseRank(rankMarker): return rankutils.inferRankFromMarker(stripToNone(rankMarker)) @classmethod def infragenericIsAuthor(cls, pn): return pn.basionymAuthorship.isEmpty() \ and pn.specificEpithet is None \ and pn.infraspecificEpithet is None \ and not pn.rank.isInfragenericStrictly() \ and not cls.LATIN_ENDINGS.search(pn.infragenericEpithet) @staticmethod def setUninomialOrGenus(matcher, pn): """ The first Capitalized word can be stored in 3 different places in ParsedName. Figure out where to best keep it: a) as the genus part of an infrageneric, bi- or trinomial b) the uninomial for names of rank genus or higher c) the infrageneric epithet in case its a standalone infrageneric name (which is hard to detect) """ # the match can be the genus part of a bi/trinomial or a uninomial monomial = stripToNone(matcher.group(1)) if matcher.group(2) is not None \ or matcher.group(4) is not None \ or matcher.group(5) is not None \ or matcher.group(8) is not None \ or pn.rank.isSpeciesOrBelow(): # and self.pn.getRank().isRestrictedToCode() != NomCode.CULTIVARS pn.genus = monomial elif pn.rank.isInfragenericStrictly(): pn.setInfragenericEpithet(monomial) else: pn.uninomial = monomial def lookForIrregularRankMarker(self): """ if no rank marker is set, inspect epitheta for wrongly placed rank markers and modify parsed name accordingly. This is sometimes the case for informal names like: Coccyzus americanus ssp. """ if self.pn.rank.otherOrUnranked(): if self.pn.infraspecificEpithet is not None: m = self.RANK_MARKER_ONLY.search(self.pn.infraspecificEpithet) if m: # we found a rank marker, make it one self.setRank(self.pn.infraspecificEpithet) self.pn.infraspecificEpithet = None if self.pn.specificEpithet is not None: m = self.RANK_MARKER_ONLY.search(self.pn.specificEpithet) if m: # we found a rank marker, make it one self.setRank(self.pn.specificEpithet) self.pn.specificEpithet = None elif self.pn.rank == Rank.SPECIES and self.pn.infraspecificEpithet is not None: # sometimes sp. is wrongly used as a subspecies rankmarker self.pn.rank = Rank.SUBSPECIES self.pn.addWarning(Warnings.SUBSPECIES_ASSIGNED) @classmethod def parseAuthorship(cls, ex, authors, year): a = Authorship() if authors is not None: a.authors = cls.splitTeam(authors) if ex is not None: a.exAuthors = cls.splitTeam(ex) a.year = cls.cleanYear(year) return a @classmethod def splitTeam(cls, team): """ Splits an author team by either ; or , """ # treat semicolon differently. Single author name can contain a comma now! if ";" in team: authors = [] for a in team.split(";"): m = cls.AUTHOR_INITIAL_SWAP.search(a) if m: authors.append(cls.normAuthor(m.group(2) + " " + m.group(1), True)) else: authors.append(cls.normAuthor(a, False)) return authors m = cls.AUTHORTEAM_DELIMITER.search(team) if m: return cls.AUTHORTEAM_DELIMITER.split(cls.normAuthor(team, False)) else: # we sometimes see space delimited authorteams with the initials consistently at the end of a single author: # Balsamo M Fregni E Tongiorgi MA SPACE_AUTHOR = "(\\p{Lu}\\p{Ll}+ \\p{Lu}+)" AUTHORTEAM_SPACED = re.compile("^" + SPACE_AUTHOR + "(?: " + SPACE_AUTHOR + ")*$") AUTHOR_SPACED = re.compile("(\\p{Lu}\\p{Ll}+) (\\p{Lu}+)") ms = AUTHORTEAM_SPACED.findall(team) if ms: # We should be able to extract the authors from the first match instead of doing it again authors = [] for g in ms: sb = "" for initial in g[1]: sb += initial + '.' sb += g[0] authors.append(sb) return authors else: # no delimiters found, treat as one author return [cls.normAuthor(team, False)] @classmethod def normAuthor(cls, authors, normPunctuation): """ Author strings are normalized by removing any whitespace following a dot. See IPNI author standard form recommendations: http://www.ipni.org/standard_forms_author.html """ if normPunctuation: authors = cls.NORM_PUNCTUATIONS.sub(r"\1", authors) return stripToNone(authors)