gambolputty / german-nouns

A list of ~100,000 German nouns and their grammatical properties, compiled from WiktionaryDE as a CSV file, plus a module to look up the data and parse compound words.

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

add non exact match strategy

lsmith77 opened this issue · comments

In our testing we found more missing words than we can realistically add to Wiktionary (see also #8), so we have now implemented the following strategy, which at least allows us to detect the genus. Would this be interesting to add to your package?

# Word endings used to guess a noun's grammatical genus, grouped by genus
# code ("n", "f", "m"). Key order and the order of the endings inside each
# list are significant to code that scans the table and stops at a match,
# so both are kept exactly as authored.
primary_german_genus_endings = {
    "n": [
        "chen", "ett", "eau", "lein", "icht", "il", "ium",
        "it", "ma", "ment", "tel", "tum", "um",
    ],
    "f": [
        "in", "a", "ade", "age", "anz", "elle", "ette", "ere", "enz",
        "ei", "ine", "isse", "itis", "ive", "ie", "heit", "keit", "ik",
        "sion", "se", "sis", "tät", "ung", "ur", "schaft",
    ],
    "m": [
        "ant", "ast", "ich", "ist", "ig", "ling", "or",
        "us", "ismus", "är", "eur", "iker", "ps",
    ],
}

# Less reliable endings, grouped by genus code like the primary table.
secondary_german_genus_endings = {
    # Roughly three out of four nouns ending in -nis or -sal are neuter.
    "n": ["nis", "sal"],
    # The overwhelming majority of German -ion nouns are feminine, although
    # exceptions exist (e.g. the masculine "Postillion").
    "f": ["ion"],
    # More than half of the nouns ending in -er, -en or -el are masculine.
    "m": ["er", "en", "el"],
}

def determine_genus_from_ending(word, german_genus_endings):
    """Guess the grammatical genus of ``word`` from its ending.

    Every ending in ``german_genus_endings`` is tested against the end of
    ``word`` (case-sensitively) and the genus of the *longest* matching
    ending wins. Picking the longest match instead of the first one keeps a
    short ending of one genus from shadowing a more specific ending of
    another: "Freiheit" ends in both "it" and "heit", and "Friseur" ends in
    both "ur" and "eur".

    Args:
        word: The noun to classify.
        german_genus_endings: Mapping of genus code to an iterable of
            endings, e.g. ``{"f": ["heit", "ung"], "m": ["ling"]}``.

    Returns:
        ``{"genus": <genus code>}`` for the best match, or ``None`` when no
        ending matches.
    """
    best_genus = None
    best_length = -1

    for genus, endings in german_genus_endings.items():
        for ending in endings:
            # Strict ">" keeps the earliest candidate when the same ending
            # is listed under more than one genus.
            if len(ending) > best_length and word.endswith(ending):
                best_genus = genus
                best_length = len(ending)

    if best_genus is None:
        return None

    return {"genus": best_genus}


def german_noun_lookup(word):
    """Look up ``word`` in ``german_nouns`` and make sure it has a genus.

    Only the first entry returned by ``german_nouns[word]`` is considered.
    If that entry has no "genus" key, the genus is taken from, in order:
    the "genus 1" key, the special case for words ending in "leute", the
    primary ending table, and finally the secondary ending table.

    The entry is shallow-copied before "genus" is written, so the record
    handed out by ``german_nouns`` is never modified. (Presumably the lookup
    returns references to its own data - confirm against the package.)
    Nested values such as the "flexion" mapping are still shared with the
    source entry.

    Args:
        word: The noun to look up.

    Returns:
        A dict describing the noun, guaranteed to contain "genus", or
        ``None`` when the word is unknown or no genus can be derived.
    """
    entries = german_nouns[word]
    if not entries:
        return None

    # Copy so that adding "genus" below does not leak into the shared data.
    result = dict(entries[0])

    if "genus" in result:
        return result

    if "genus 1" in result:
        result["genus"] = result["genus 1"]
        return result

    # NOTE(review): "...leute" words are tagged "f"; presumably because
    # these plural forms take the article "die" - confirm.
    if word[-5:].lower() == "leute":
        result["genus"] = "f"
        return result

    # Prefer the primary endings; consult the secondary ones only when the
    # primary table yields nothing.
    for endings in (primary_german_genus_endings, secondary_german_genus_endings):
        guess = determine_genus_from_ending(word, endings)
        if guess is not None and "genus" in guess:
            result["genus"] = guess["genus"]
            return result

    return None


def german_noun_analysis(word, genus_only=False):
    """Analyse ``word``, falling back to compound splitting and endings.

    Steps, in order:

    1. A direct ``german_noun_lookup(word)``.
    2. With ``genus_only``: a guess from the primary ending table.
    3. Compound splitting: progressively shorter tails of ``word`` (at
       least two letters are dropped from the front and the tail is at
       least three letters long) are capitalised and looked up. The first
       hit is returned with "Lemma" set to ``word``; unless ``genus_only``
       is set, every "flexion" value is lower-cased and prefixed with the
       part of ``word`` in front of the tail.
    4. With ``genus_only``: a guess from the secondary ending table. The
       original code repeated the primary-table call from step 2 here,
       which could never succeed; the secondary table mirrors the fallback
       order used by ``german_noun_lookup``.

    The matched entry is copied before it is modified, so repeated calls do
    not change the data behind ``german_noun_lookup`` or stack prefixes onto
    already rewritten forms.

    Args:
        word: The (possibly compound) noun to analyse.
        genus_only: When true, only the genus is needed: ending-based
            guesses are allowed and inflected forms are not rewritten.

    Returns:
        A dict describing the noun (only ``{"genus": ...}`` for
        ending-based guesses), or ``None`` when nothing matched.

    Raises:
        KeyError: If a matched entry has no "flexion" key and
            ``genus_only`` is false.
    """
    result = german_noun_lookup(word)
    if result is not None:
        return result

    if genus_only:
        result = determine_genus_from_ending(word, primary_german_genus_endings)
        if result is not None:
            return result

    # Skip the first 2 letters and never look at tails shorter than 3.
    for split_at in range(2, len(word) - 2):
        partial_word = word[split_at:]

        # avoid cases like 'Ende' at the end of 'Arbeitgebende'
        if partial_word == "ende":
            break

        match = german_noun_lookup(partial_word.capitalize())
        if match is None:
            continue

        # Copy before editing so the looked-up entry stays untouched.
        result = dict(match)
        result["Lemma"] = word
        if not genus_only:
            word_prefix = word[:split_at]
            # assumes "flexion" maps form names to strings - TODO confirm
            result["flexion"] = {
                form_name: word_prefix + form.lower()
                for form_name, form in match["flexion"].items()
            }

        return result

    if genus_only:
        # Last resort: the less reliable endings.
        return determine_genus_from_ending(word, secondary_german_genus_endings)

    return None