Module faq_forum.question_match

Source code
from fuzzywuzzy import fuzz
import gensim
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import pickle

stop_words = set(stopwords.words("english"))

# Load the slim GoogleNews word2vec model (300-dimensional vectors)
W2VModel = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300-SLIM.bin.gz", binary=True)

# Load the question matching model with 4 threads
bst_loaded = xgb.Booster({"nthread": 4})
bst_loaded.load_model("0001.model")

feature_labels = [
    'len_q1', 'len_q2', 'diff_len', 'len_char_q1', 'len_char_q2',
     'len_word_q1', 'len_word_q2', 'common_words', 'fuzz_QRatio',
     'fuzz_WRatio', 'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
     'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio', 'fuzz_token_sort_ratio',
     'cosine_distance', 'cityblock_distance', 'jaccard_distance', 'canberra_distance', 'euclidean_distance',
     'minkowski_distance', 'braycurtis_distance', 'wmd', 'norm_wmd']

# Run once:
# nltk.download("punkt")
# nltk.download("stopwords")

"""
Method getFeatures(question1, question2) returns dictionary of features given two questions
"len_q1"                            =   length of the first string
"len_q2"                            =   length of second string
"diff_len"                          =   difference in length (len_q1-len_q2)
"len_char_q1"                       =   length of the first string without the spaces
"len_char_q2"                       =   length of the second string without the spaces
"len_word_q1"                       =   word count of the first string
"len_word_q2"                       =   word count of the second string
"common_words"                      =   count of words the two strings have in common
"fuzz_Qratio"                       =   Q ratio of the strings
"fuzz_Wratio"                       =   W ratio of the string
"fuzz_partial_ratio"                =   partial ratio of the strings
"fuzz_partial_token_set_ratio"      =   partial token set ratio
"fuzz_partial_token_sort_ratio"     =   partial token sort ratio
"fuzz_token_set_ratio"              =   token set ratio
"fuzz_token_sort_ratio"             =   token sort ratio
"cosine_distance": cosine(W2V[0], W2V[1]),
"cityblock_distance": cityblock(W2V[0], W2V[1]),
"jaccard_distance": jaccard(W2V[0], W2V[1]),
"canberra_distance": canberra(W2V[0], W2V[1]),
"euclidean_distance": euclidean(W2V[0], W2V[1]),
"minkowski_distance": minkowski(W2V[0], W2V[1]),
"braycurtis_distance": braycurtis(W2V[0], W2V[1]),
"wmd": W2V[2],
"norm_wmd": W2V[3]
"""


def get_features(question1, question2):
    """
    Get all the features to input into the XGBoost model.

    Pre: Neither question may be None.
    Args:
        question1: The first question to match.
        question2: The second question to match.

    Returns: A dictionary with all the features for the XGBoost model.

    """
    w2v = word2vec_features(question1, question2, W2VModel)
    output_dict = {
        # length based features
        "len_q1": [len(question1)],
        "len_q2": [len(question2)],
        "diff_len": [len(question1) - len(question2)],
        "len_char_q1": [len(question1.replace(" ", ""))],
        "len_char_q2": [len(question2.replace(" ", ""))],
        "len_word_q1": [len(question1.split())],
        "len_word_q2": [len(question2.split())],
        "common_words": [len(set(question1.lower().split()).intersection(set(question2.lower().split())))],
        # distance based features
        #   (fuzzywuzzy library tutorial: https://www.datacamp.com/community/tutorials/fuzzy-string-python)
        "fuzz_Qratio": [fuzz.QRatio(question1, question2)],
        "fuzz_Wratio": [fuzz.WRatio(question1, question2)],
        "fuzz_partial_ratio": [fuzz.partial_ratio(question1, question2)],
        "fuzz_partial_token_set_ratio": [fuzz.partial_token_set_ratio(question1, question2)],
        "fuzz_partial_token_sort_ratio": [fuzz.partial_token_sort_ratio(question1, question2)],
        "fuzz_token_set_ratio": [fuzz.token_set_ratio(question1, question2)],
        "fuzz_token_sort_ratio": [fuzz.token_sort_ratio(question1, question2)],

        # word2vec based features
        "cosine_distance": [cosine(w2v[0], w2v[1])],
        "cityblock_distance": [cityblock(w2v[0], w2v[1])],
        "jaccard_distance": [jaccard(w2v[0], w2v[1])],
        "canberra_distance": [canberra(w2v[0], w2v[1])],
        "euclidean_distance": [euclidean(w2v[0], w2v[1])],
        "minkowski_distance": [minkowski(w2v[0], w2v[1])],
        "braycurtis_distance": [braycurtis(w2v[0], w2v[1])],
        "wmd": [w2v[2]],
        "norm_wmd": [w2v[3]]

    }
    return output_dict


def match(question1, question2):
    """
    Compute and return the probability that the two given questions are semantically the same.

    Args:
        question1: The first question
        question2: The second question

    Returns: The probability p (a float, 0 <= p <= 1) that the two given questions are semantically the same (1 = same, 0 = different).

    """
    # Calculate the features
    features = get_features(question1, question2)
    # Transform the features to a pandas DataFrame
    features = pd.DataFrame(features, columns=feature_labels)
    # Clean up infinites and NaNs
    features = features.replace([np.inf, -np.inf], np.nan).fillna(0).values
    # Load the scaler and scale the data
    with open("scaler.p", "rb") as scaler_file:
        scaler = pickle.load(scaler_file)
    features = scaler.transform(features)
    # Give the features to the model for prediction
    model_input = xgb.DMatrix(features)
    pred = bst_loaded.predict(model_input)
    return pred[0]


def get_wmd(question1, question2, model):
    """
    Get the word mover's distance (WMD) between the two given questions.

    Pre: No parameters may be None.
    Args:
        question1: The first question to match.
        question2: The second question to match.
        model: A word2vec-model.

    Returns: The word mover's distance between the two questions.

    """
    s1 = question1.lower().split()
    s2 = question2.lower().split()
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)


def word2vec_features(question1, question2, model):
    """
    Get the word2vec features of the two questions.

    Pre: No parameters may be None.
    Args:
        question1: The first question to match.
        question2: The second question to match.
        model: A word2vec-model.

    Returns: A list with the word2vec features between the two questions.

    """
    # Calculate the sent2vec vectors for every question
    w2v_q1 = np.array(sent2vec(question1, model))
    w2v_q2 = np.array(sent2vec(question2, model))
    wmd = get_wmd(question1, question2, model)
    # Normalize the word vectors in place; the second WMD computation then
    # uses the normalized vectors
    model.init_sims(replace=True)
    norm_wmd = get_wmd(question1, question2, model)
    return [w2v_q1, w2v_q2, wmd, norm_wmd]


# Google's Word2vec model expects words as input, so sentences must be
# transformed to vectors indirectly
def sent2vec(s, model):
    """
    Transform a sentence to a vector.

    Pre: No parameters may be None.
    Args:
        s: The sentence to transform.
        model: A word2vec model.

    Returns: A vector, representing the given sentence.

    """
    words = word_tokenize(s.lower())
    # Stopwords and numbers must be removed, as well as words that are not
    # part of the model
    M = [model[w] for w in words if w not in stop_words and w.isalpha() and w in model]
    M = np.array(M)
    if len(M) > 0:
        v = M.sum(axis=0)
        return v / np.sqrt((v ** 2).sum())
    else:
        # When no valid tokens remain after filtering, fall back to the
        # model's embedding of the word 'null' (not an all-zero vector)
        return model.get_vector('null')


"""
prediction = match("Where is the coffee machine?", "Where can I find the coffee machine?")
print(prediction)
prediction = match("Where is the coffee machine?", "When did the Titanic sink?")
print(prediction)
prediction = match("How can I open the coffee machine?", "Where is the coffee machine?")
print(prediction)
prediction = match("How big is the coffee machine?", "How can I use the coffee machine?")
print(prediction)
prediction = match("Where is the coffee machine?", "Where is the coffee machine?")
print(prediction)
"""

Global variables

var feature_labels

The ordered list of feature names the XGBoost model expects; see the feature glossary in the module source for the meaning of each entry.

Functions

def get_features(question1, question2)

Get all the features to input into the XGBoost model.

Pre: Neither question may be None.

Args

question1
The first question to match.
question2
The second question to match.

Returns: A dictionary with all the features for the XGBoost model.

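A minimal usage sketch, assuming the module imported successfully (the example questions are illustrative). Every value in the returned dictionary is a one-element list, so the result maps directly onto a single-row DataFrame ordered by feature_labels:

import pandas as pd

features = get_features("Where is the coffee machine?",
                        "Where can I find the coffee machine?")
# Order the columns exactly as the XGBoost model expects them
row = pd.DataFrame(features, columns=feature_labels)
print(row.iloc[0])
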
def get_wmd(question1, question2, model)

Get the word mover's distance (WMD) between the two given questions.

Pre: No parameters may be None.

Args

question1
The first question to match.
question2
The second question to match.
model
A word2vec-model.

Returns: The word mover's distance between the two questions.

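An illustrative sketch (hypothetical questions): the distance should be smaller for paraphrases than for unrelated questions, since fewer word "moves" are needed in embedding space:

d_close = get_wmd("Where is the coffee machine?",
                  "Where can I find the coffee machine?", W2VModel)
d_far = get_wmd("Where is the coffee machine?",
                "When did the Titanic sink?", W2VModel)
print(d_close, d_far)  # expect d_close < d_far

Note that word2vec_features() calls model.init_sims(replace=True), which normalizes the word vectors in place; after that has happened once, get_wmd() returns the normalized distance.
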
def match(question1, question2)

Compute and return the probability that the two given questions are semantically the same.

Args

question1
The first question
question2
The second question
Returns: The probability p (a float, 0 <= p <= 1) that the two given questions are semantically the same (1 = same, 0 = different).
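A usage sketch mirroring the commented-out examples in the module source; the model files must be in the working directory:

p = match("Where is the coffee machine?",
          "Where can I find the coffee machine?")
print(f"Match probability: {p:.3f}")  # float in [0, 1]

Note that match() reloads scaler.p from disk on every call; loading it once at module level, as is done for the booster, would avoid the repeated file read.
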
def sent2vec(s, model)

Transform a sentence to a vector.

Pre: No parameters may be None.

Args

s
The sentence to transform.
model
A word2vec model.

Returns: A vector, representing the given sentence.

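A small sketch of the output's shape and normalization; the 300-dimensional size assumes the GoogleNews vectors loaded at module level:

import numpy as np

v = sent2vec("Where is the coffee machine?", W2VModel)
print(v.shape)            # (300,) with the GoogleNews model
print(np.linalg.norm(v))  # ~1.0: the summed vector is L2-normalized
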
def word2vec_features(question1, question2, model)

Get the word2vec features of the two questions.

Pre: No parameters may be None.

Args

question1
The first question to match.
question2
The second question to match.
model
A word2vec-model.

Returns: A list with the word2vec features between the two questions.

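A sketch unpacking the returned list (illustrative questions, module-level model assumed):

from scipy.spatial.distance import cosine

w2v_q1, w2v_q2, wmd, norm_wmd = word2vec_features(
    "Where is the coffee machine?",
    "Where can I find the coffee machine?",
    W2VModel)
# The sentence vectors feed the distance features; wmd and norm_wmd are
# the raw and normalized word mover's distances
print(cosine(w2v_q1, w2v_q2), wmd, norm_wmd)

Because init_sims(replace=True) overwrites the stored word vectors, the first call permanently normalizes the model; on later calls wmd and norm_wmd coincide.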