Source code for smarter.apps.plugin.nlp

"""Natural language processing functions for OpenAI API-compatible providers."""

import re
import string

import Levenshtein



[docs]
def clean_prompt(prompt: str) -> str:
    """
    Clean up a prompt by inserting spaces before capital letters.

    This function transforms concatenated or camel-cased words into a more readable format by
    adding spaces before capital letters, except for names starting with "Mc". Useful for
    improving prompt clarity in NLP tasks.

    :param prompt: The input string to clean.
    :type prompt: str

    :return: The cleaned string with spaces before capital letters.
    :rtype: str

    .. note::
        This is a simple heuristic and may not handle all edge cases perfectly.
        For example, names starting with "Mc" (e.g., "McDaniel") are not split.

    .. tip::

        Use this function to preprocess user input or model prompts for better readability.

    .. seealso::

        - :func:`lower_case_splitter`
        - :func:`simple_search`

    **Example usage**:

    .. code-block:: python

        from smarter.apps.plugin.nlp import clean_prompt

        s = "WhoIsLawrenceMcDaniel"
        print(clean_prompt(s))
        # Output: "Who Is Lawrence McDaniel"
    """
    pattern = r"(?<!Mc)([A-Z][a-z]+)|(?<!Mc)([A-Z]+)"
    retval = []
    for word in prompt.split():
        word = word.translate(str.maketrans("", "", string.punctuation))
        words = re.sub(pattern, r" \1\2", word).split()
        retval.extend(words)
    retval = " ".join(retval)
    return retval




[docs]
def lower_case_splitter(string_of_words: str) -> list:
    """
    Split a string on spaces and return a list of lowercase words.

    This function tokenizes a string by spaces and converts each token to lowercase.
    Useful for case-insensitive text processing, search, and normalization in NLP tasks.

    :param string_of_words: The input string to split and lowercase.
    :type string_of_words: str

    :return: List of lowercase words.
    :rtype: list[str]

    .. tip::

        Use this function to prepare text for matching, searching, or comparison.

    .. seealso::

        - :func:`clean_prompt`
        - :func:`simple_search`

    **Example usage**:

    .. code-block:: python

        from smarter.apps.plugin.nlp import lower_case_splitter

        s = "The Quick Brown Fox"
        print(lower_case_splitter(s))
        # Output: ['the', 'quick', 'brown', 'fox']

    """
    return [word.lower() for word in string_of_words.split()]




[docs]
def simple_search(prompt: str, search_term: str) -> bool:
    """
    Check if the prompt contains the target string.

    This function performs a case-insensitive search for the `search_term` within the `prompt`.
    It also checks if all tokens in the search term appear in the prompt, regardless of order.

    :param prompt: The input string to search within.
    :type prompt: str
    :param search_term: The target string or phrase to look for.
    :type search_term: str

    :return: `True` if the search term is found in the prompt, otherwise `False`.
    :rtype: bool

    .. tip::

        Use this function for simple keyword or phrase matching in user prompts or text analysis.

    .. caution::

        This function does not perform fuzzy matching or handle typos. For more advanced matching,
        consider using :func:`within_levenshtein_distance`.

    .. seealso::

        - :func:`lower_case_splitter`
        - :func:`within_levenshtein_distance`
        - :func:`does_refer_to`

    **Example usage**:

    .. code-block:: python

        from smarter.apps.plugin.nlp import simple_search

        prompt = "Find all weather plugins for New York"
        print(simple_search(prompt, "weather plugins"))  # True
        print(simple_search(prompt, "Weather"))          # True
        print(simple_search(prompt, "California"))       # False

    """

    # simplest possible case: the search term is in the prompt
    if search_term.lower() in prompt.lower():
        return True

    prompt_words = lower_case_splitter(prompt)
    token_count = len(search_term.split())
    found_count = 0
    for token in lower_case_splitter(search_term):
        if token in prompt_words:
            found_count += 1
        if found_count >= token_count:
            return True
    return False




[docs]
def within_levenshtein_distance(prompt: str, search_term: str, threshold: int = 3) -> bool:
    """
    Check if the prompt is within the given Levenshtein distance of the target string.

    This function compares each title-cased word in the prompt to the `search_term` using the
    Levenshtein distance metric. If any word is within the specified `threshold`, the function
    returns `True`. Useful for fuzzy matching and typo-tolerant search.

    :param prompt: The input string to search within.
    :type prompt: str
    :param search_term: The target string to compare against.
    :type search_term: str
    :param threshold: The maximum allowed Levenshtein distance for a match (default: 3).
    :type threshold: int

    :return: `True` if a word in the prompt is within the threshold distance of the search term, otherwise `False`.
    :rtype: bool

    .. tip::

        Adjust the `threshold` parameter to control the strictness of fuzzy matching.

    .. caution::

        Only title-cased words in the prompt are considered for comparison.

    .. seealso::

        - :func:`simple_search`
        - :func:`does_refer_to`
        - `Levenshtein.distance <https://pypi.org/project/python-Levenshtein/>`_

    **Example usage**:

    .. code-block:: python

        from smarter.apps.plugin.nlp import within_levenshtein_distance

        prompt = "Find all plugins for Lawrance McDaniel"
        print(within_levenshtein_distance(prompt, "Lawrence", threshold=2))  # True

    """
    words = lower_case_splitter(prompt)
    names = [word for word in words if word.istitle()]
    for name in names:
        distance = Levenshtein.distance(search_term, name)
        if distance <= threshold:
            return True
    return False




[docs]
def does_refer_to(prompt: str, search_term: str, threshold=3) -> bool:
    """
    Check if the prompt refers to the given string.

    This function determines whether a prompt refers to a target string by first cleaning the prompt,
    then performing both direct and fuzzy matching. It uses :func:`simple_search` for exact or token-based
    matches, and :func:`within_levenshtein_distance` for typo-tolerant fuzzy matches.

    :param prompt: The input string to analyze.
    :type prompt: str
    :param search_term: The target string to check for reference.
    :type search_term: str
    :param threshold: The maximum Levenshtein distance for fuzzy matching (default: 3).
    :type threshold: int

    :return: `True` if the prompt refers to the search term, otherwise `False`.
    :rtype: bool

    .. important::

        This function combines both exact and fuzzy matching for robust reference detection.

    .. tip::

        Adjust the `threshold` parameter for stricter or looser fuzzy matching.

    .. seealso::

        - :func:`clean_prompt`
        - :func:`simple_search`
        - :func:`within_levenshtein_distance`

    **Example usage**:

    .. code-block:: python

        from smarter.apps.plugin.nlp import does_refer_to

        prompt = "WhoIsLawranceMcDaniel"
        print(does_refer_to(prompt, "Lawrence McDaniel"))  # True
        print(does_refer_to(prompt, "John Doe"))           # False

    """

    prompt = clean_prompt(prompt)

    if simple_search(prompt=prompt, search_term=search_term):
        return True

    if within_levenshtein_distance(prompt=prompt, search_term=search_term, threshold=threshold):
        return True

    # bust. we didn't find the target string in the prompt
    return False