In [2]:
#  Import the libraries and modules used in this notebook

import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct  #Cosine Similarity
import time

In [33]:
# Read the tab-separated Web of Science extract.
outfile = 'wos_extract_complete.csv'
record_col = "UT (Unique WOS ID)"
wos = pd.read_csv(outfile, sep="\t", low_memory=False)

# One row per (record, affiliation): split the ';'-separated "Affiliations"
# field inside each record group, then explode the resulting lists.
affiliations = (
    wos.groupby(record_col)["Affiliations"]
    .apply(lambda group: group.str.split(';'))
    .explode()
    .reset_index()
    .drop(columns="level_1")
)

# Normalise the names (trim whitespace, upper-case) and drop repeated rows.
affiliations["Affiliations"] = affiliations["Affiliations"].str.strip().str.upper()
affiliations = affiliations.drop_duplicates()

# Single-column frame of names with a fresh 0..n-1 index; missing names become "UNKNOWN".
df = (
    affiliations["Affiliations"]
    .fillna("UNKNOWN")
    .reset_index(drop=True)
    .to_frame()
)

In [25]:
# Sanity check: show the type of `df` (built with `to_frame()`, so a DataFrame rather than a Series).
type(df)

pandas.core.frame.DataFrame

In [15]:
# Count how often each distinct row of `df` (i.e. each affiliation name) occurs.
df.value_counts()

Affiliations
CHINESE ACADEMY OF SCIENCES                            1237
UDICE-FRENCH RESEARCH UNIVERSITIES                      664
CENTRE NATIONAL DE LA RECHERCHE SCIENTIFIQUE (CNRS)     658
HELMHOLTZ ASSOCIATION                                   432
TSINGHUA UNIVERSITY                                     429
                                                       ... 
UNIVERSIDAD PONTIFICIA BOLIVARIANA                        1
CALIFORNIA STATE UNIVERSITY CHICO                         1
KOREA ENVIRONMENT INSTITUTE (KEI)                         1
NATIONAL INSTITUTE OF TECHNOLOGY MEGHALAYA                1
SAINT JAMES'S UNIVERSITY HOSPITAL                         1
Name: count, Length: 4906, dtype: int64

In [9]:
# Frequency of every affiliation whose name contains "CHINESE ACADEMY".
# `df` is a DataFrame, which has no `.str` accessor, so the string test has to
# be run on its "Affiliations" column; the boolean mask then filters the frame.
df[df["Affiliations"].str.contains("CHINESE ACADEMY", na=False)].value_counts()

Affiliations
CHINESE ACADEMY OF SCIENCES                                           1237
UNIVERSITY OF CHINESE ACADEMY OF SCIENCES, CAS                         420
CHINESE ACADEMY OF AGRICULTURAL SCIENCES                                50
CHINESE ACADEMY OF MEDICAL SCIENCES - PEKING UNION MEDICAL COLLEGE      50
CHINESE ACADEMY OF ENGINEERING PHYSICS                                  11
CHINESE ACADEMY OF FORESTRY                                              9
CHINESE ACADEMY OF SURVEYING & MAPPING                                   8
CHINESE ACADEMY OF SOCIAL SCIENCES                                       7
CHINESE ACADEMY OF METEOROLOGICAL SCIENCES (CAMS)                        7
CHINESE ACADEMY OF TROPICAL AGRICULTURAL SCIENCES                        3
CHINESE ACADEMY OF GEOLOGICAL SCIENCES                                   3
CHINESE ACADEMY OF FISHERY SCIENCES                                      2
UNIVERSITY OF CHINESE ACADEMY OF SOCIAL SCIENCES                         1
Name: count,

In [35]:
#  ngrams: here we take n = 3 (3-grams, i.e. trigrams), as most affiliation names only contain a few words.
#  The function first cleans the string by removing some punctuation (commas, hyphens, dots and slashes, i.e. ",-./"),
#  and then generates and collects all character n-grams of the cleaned string.


def ngrams(string, n=3):
    """Return the list of character n-grams of ``string`` after light cleaning.

    Commas, hyphens, dots and slashes, as well as any whitespace character
    followed by ``BD``, are removed before the n-grams are generated.

    Parameters
    ----------
    string : str or any
        Text to tokenise. ``None`` and float NaN are treated as missing and
        yield an empty list; any other non-string value is converted with
        ``str()`` instead of raising ``TypeError`` inside ``re.sub``.
    n : int, default 3
        Length of each n-gram.

    Returns
    -------
    list of str
        Overlapping n-grams in order of appearance; empty when the cleaned
        string is shorter than ``n``.
    """
    if not isinstance(string, str):
        # NaN is the only float that is not equal to itself.
        if string is None or (isinstance(string, float) and string != string):
            return []
        string = str(string)

    # The hyphen is escaped so it is matched literally instead of forming a
    # character range between ',' and '.'.
    cleaned = re.sub(r'[,\-./]|\sBD', r'', string)

    # Zipping n progressively shifted copies yields one tuple per n-gram.
    shifted = [cleaned[offset:] for offset in range(n)]
    return [''.join(chars) for chars in zip(*shifted)]




TypeError: expected string or bytes-like object

In [36]:
# Once each string has been split into tokens (here the character n-grams generated above), it can be turned into a vector.
# Scikit-learn's TfidfVectorizer does exactly this: it converts a collection of raw documents to a matrix of TF-IDF features.
# Generate the matrix of TF-IDF (Term Frequency-Inverse Document Frequency) values for each affiliation name.


# Vectorise every affiliation name, using the character n-grams produced by
# `ngrams` as the tokens.
# NOTE(review): `types` still contains repeated names (rows were de-duplicated
# per record/name pair, not per name) — consider de-duplicating before
# matching; confirm what downstream cells expect.
types = df['Affiliations']
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
tf_idf_matrix = vectorizer.fit_transform(types)

In [27]:
tf_idf_matrix

<65153x6417 sparse matrix of type '<class 'numpy.float64'>'
	with 1752829 stored elements in Compressed Sparse Row format>

In [37]:
# To calculate the similarity between two vectors of TF-IDF values, the cosine similarity is usually used.
# The resulting matrix is very sparse, and Scikit-learn deals with this nicely by returning a sparse CSR matrix.

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Compute the sparse product of ``A`` and ``B`` via ``ct.sparse_dot_topn``.

    Output buffers sized for ``ntop`` entries per row of ``A`` are allocated
    here and filled by ``ct.sparse_dot_topn``, which also receives
    ``lower_bound``; the buffers are then wrapped in a CSR matrix.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands; both are converted to CSR first.
    ntop : int
        Number of result slots reserved per row of ``A``.
    lower_bound : number, default 0
        Threshold forwarded unchanged to ``ct.sparse_dot_topn``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])``.
    """
    index_type = np.int32

    # tocsr() hands back a CSR view of each operand for the raw-array access below.
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    # Pre-allocate the output buffers that the compiled routine writes into.
    capacity = n_rows * ntop
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    def as_index(array):
        # Index arrays are handed over as 32-bit integers.
        return np.asarray(array, dtype=index_type)

    ct.sparse_dot_topn(
        n_rows, n_cols,
        as_index(left.indptr), as_index(left.indices), left.data,
        as_index(right.indptr), as_index(right.indices), right.data,
        ntop,
        lower_bound,
        out_indptr, out_indices, out_data)

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))


In [38]:
#  Run the optimized cosine similarity function.
#  Only stores the top N_TOP_MATCHES most similar items with a similarity above MIN_SIMILARITY.
N_TOP_MATCHES = 10
MIN_SIMILARITY = 0.8

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), N_TOP_MATCHES, MIN_SIMILARITY)
t = time.perf_counter() - t1
print("SELFTIMED:", t)


SELFTIMED: 47.96097183227539


In [29]:
# unpacks the resulting sparse matrix

def get_matches_df(sparse_matrix, name_vector, top=100):
    """Unpack a sparse similarity matrix into a table of matched name pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        A non-zero entry at (i, j) becomes one output row pairing name ``i``
        with name ``j``; the stored value is reported as the similarity.
    name_vector : sequence or pandas.Series
        Names, looked up by position, so element ``i`` must correspond to
        row/column ``i`` of the matrix.
    top : int or None, default 100
        Maximum number of matches to return, in storage order. A falsy value
        (``None`` or ``0``) returns every match. Values larger than the number
        of available matches are clamped instead of raising ``IndexError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similarity``.

    Raises
    ------
    ValueError
        If ``top`` is negative.
    """
    if top is not None and top < 0:
        raise ValueError("top must be non-negative")

    # COO form exposes row, column and value as aligned arrays. Masking all
    # three together keeps them aligned even when explicit zeros are stored,
    # which indexing `.data` by the positions from `.nonzero()` does not.
    coo = sparse_matrix.tocoo()
    stored = coo.data != 0
    rows = coo.row[stored]
    cols = coo.col[stored]
    values = coo.data[stored]

    nr_matches = min(int(top), values.size) if top else values.size

    # Positional lookup: a Series with a non-default index would otherwise be
    # indexed by label.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({'left_side': names[rows[:nr_matches]],
                         'right_side': names[cols[:nr_matches]],
                         'similarity': values[:nr_matches].astype(float)})

In [51]:


# Store the matches in a new dataframe called matches_df and display it.
# top=None unpacks every stored match; a hard-coded count only fits one
# particular run of the data and breaks as soon as the input changes.
matches_df = get_matches_df(matches, types, top=None)
matches_df = matches_df[matches_df['similarity'] < 0.99999] # For removing all exact matches
matches_df



Unnamed: 0,left_side,right_side,similarity
3274,LOUISIANA STATE UNIVERSITY,LOUISIANA STATE UNIVERSITY SYSTEM,0.883792
7376,SHAOYANG UNIVERSITY,CHAOYANG UNIVERSITY OF TECHNOLOGY,0.807530
7377,SHAOYANG UNIVERSITY,CHAOYANG UNIVERSITY OF TECHNOLOGY,0.807530
7378,SHAOYANG UNIVERSITY,CHAOYANG UNIVERSITY OF TECHNOLOGY,0.807530
7379,SHAOYANG UNIVERSITY,CHAOYANG UNIVERSITY OF TECHNOLOGY,0.807530
...,...,...,...
599251,CHANGZHOU UNIVERSITY,YANGZHOU UNIVERSITY,0.846442
599252,CHANGZHOU UNIVERSITY,YANGZHOU UNIVERSITY,0.846442
599253,CHANGZHOU UNIVERSITY,YANGZHOU UNIVERSITY,0.846442
599254,CHANGZHOU UNIVERSITY,YANGZHOU UNIVERSITY,0.846442


Unnamed: 0,left_side,right_side,similarity
0,NATURAL HISTORY MUSEUM LONDON,NATURAL HISTORY MUSEUM LONDON,1.0
1,NATURAL HISTORY MUSEUM LONDON,NATURAL HISTORY MUSEUM LONDON,1.0
2,NATURAL HISTORY MUSEUM LONDON,NATURAL HISTORY MUSEUM LONDON,1.0
3,NATURAL HISTORY MUSEUM LONDON,NATURAL HISTORY MUSEUM LONDON,1.0
4,BULGARIAN ACADEMY OF SCIENCES,BULGARIAN ACADEMY OF SCIENCES,1.0
...,...,...,...
195,DELFT UNIVERSITY OF TECHNOLOGY,DELFT UNIVERSITY OF TECHNOLOGY,1.0
196,DELFT UNIVERSITY OF TECHNOLOGY,DELFT UNIVERSITY OF TECHNOLOGY,1.0
197,NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA,NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA,1.0
198,NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA,NATIONAL UNIVERSITY OF DEFENSE TECHNOLOGY - CHINA,1.0
