# Script objective:
# SpaCy: Identify suppliers that are people

# Set-up instructions
# - See README.md for set-up instructions

# 1/ Imports
# 1.1/ Libraries
import polars as PI_POLARS
import os as PI_OS

# 1.2/ Custom functions
from Z_SHARED_FUNCTIONS.FC_CONFIG_LOADER import ZV_DI_VARIABLES
from Z_SHARED_FUNCTIONS.FC_EXPORT import FC_EXPORT_EXCEL_POLARS
from Z_SHARED_FUNCTIONS.FC_IMPORT import FC_IMPORT_XLSX
from Z_SHARED_FUNCTIONS.FC_IMPORT import FC_IMPORT_TEXT
from Z_SHARED_FUNCTIONS.FC_CLEAN_FIELDS import FC_CLEAN_FIELDS
from Z_SHARED_FUNCTIONS.FC_CREATE_SPACY_NLP_MODEL import FC_CREATE_SPACY_NLP_MODEL
from Z_SHARED_FUNCTIONS.FC_SPACY_TRAIN_NLP_MODEL import FC_SPACY_TRAIN_NLP_MODEL

# 2/ Variables
# NOTE: if the environment variable ZV_ST_ROOT_FOLDER is not set, getenv
# returns None and the path joins below raise a TypeError at import time.
ZV_ST_ROOT_FOLDER = PI_OS.getenv('ZV_ST_ROOT_FOLDER')
ZV_ST_SOURCES_FOLDER = PI_OS.path.join(ZV_ST_ROOT_FOLDER, '01_SOURCES')
ZV_ST_RESULTS_FOLDER = PI_OS.path.join(ZV_ST_ROOT_FOLDER, '03_RESULTS')
ZV_ST_MODEL_NAME = ZV_DI_VARIABLES.get('ZV_ST_MODEL_NAME')
ZV_ST_SUPPLIER_FILE = ZV_DI_VARIABLES.get('ZV_ST_SUPPLIER_FILE')
ZV_ST_COMPANY_WORDS_FILE = ZV_DI_VARIABLES.get('ZV_ST_COMPANY_WORDS_FILE')
ZV_ST_TRAIN_DATA_FILE = ZV_DI_VARIABLES.get('ZV_ST_TRAIN_DATA_FILE')
ZV_ST_RESULTS_FILE = ZV_DI_VARIABLES.get('ZV_ST_RESULTS_FILE')
ZV_NU_ITERATIONS = int(ZV_DI_VARIABLES.get('ZV_NU_ITERATIONS'))
# The flag is compared case-insensitively against the text 'true'; the
# .lower() call requires the configured value to be a string.
ZV_BO_TRAIN_DATA = (ZV_DI_VARIABLES.get('ZV_BO_TRAIN_DATA')).lower() == 'true'


def _FC_GET_SPACY_LABEL(ZV_OB_SPACY_DOC):
    """Return the label to store for one processed supplier name.

    Selection rules:
    - 'ORG' if any entity of the document is labelled 'ORG';
    - otherwise the label of the first entity, if there is one;
    - otherwise 'UNKNOWN'.

    Args:
        ZV_OB_SPACY_DOC: Object exposing an ``ents`` sequence whose items
            have a ``label_`` attribute.

    Returns:
        The selected label as a string.
    """
    ZV_LI_ENT_LABELS = [ZV_OB_ENT.label_ for ZV_OB_ENT in ZV_OB_SPACY_DOC.ents]

    # 'ORG' takes priority over any other entity label
    if 'ORG' in ZV_LI_ENT_LABELS:
        return 'ORG'

    if ZV_LI_ENT_LABELS:
        return ZV_LI_ENT_LABELS[0]

    return 'UNKNOWN'


def FC_P01_11_SUPPLIERS_ARE_PEOPLE():
    """Label every supplier name with an NLP entity label and export it.

    Steps:
    1. Import the supplier, company-word and training-data files from the
       sources folder.
    2. Pass the name fields through FC_CLEAN_FIELDS.
    3. Create the NLP model and, when ZV_BO_TRAIN_DATA is true, train it.
    4. Score each 'Company_name' and store the result in the column
       'ZF_ST_SPACY_LABEL'.
    5. Overwrite the label with 'ORG' when any space-separated word of the
       company name matches a company word.
    6. Export the result to the results folder.

    Returns:
        None. The result is written through FC_EXPORT_EXCEL_POLARS.
    """
    # 1/ Import data:
    # 1.1/ Suppliers
    ZV_LI_FIELDS = [
        'Company_number',
        'Company_name',
        'Country',
        'Tax_code',
        'Country_code'
    ]
    ZV_DF_SUPPLIERS = FC_IMPORT_XLSX(
        ZV_ST_SUPPLIER_FILE,
        ZV_ST_SOURCES_FOLDER,
        ZV_LI_FIELDS
    )

    # 1.2/ Company words
    ZV_LI_FIELDS = ['COMPANY_WORD']
    ZV_DF_COMPANY_WORDS = FC_IMPORT_TEXT(
        ZV_ST_COMPANY_WORDS_FILE,
        ZV_ST_SOURCES_FOLDER,
        ZV_LI_FIELDS
    )

    # 1.3/ Train data
    ZV_LI_FIELDS = [
        'Entity name',
        'Trained',
        'Entity label'
    ]
    ZV_DF_TRAINDATA = FC_IMPORT_XLSX(
        ZV_ST_TRAIN_DATA_FILE,
        ZV_ST_SOURCES_FOLDER,
        ZV_LI_FIELDS
    )

    # 2/ Clean fields
    ZV_DF_SUPPLIERS = FC_CLEAN_FIELDS(ZV_DF_SUPPLIERS, 'Company_name')
    ZV_DF_COMPANY_WORDS = FC_CLEAN_FIELDS(ZV_DF_COMPANY_WORDS, 'COMPANY_WORD')
    ZV_DF_TRAINDATA = FC_CLEAN_FIELDS(ZV_DF_TRAINDATA, 'Entity name')

    # 3/ Create NLP model
    ZV_OB_SPACY_NLP_MODEL = FC_CREATE_SPACY_NLP_MODEL(
        ZV_ST_SOURCES_FOLDER,
        ZV_ST_MODEL_NAME
    )

    # 4/ Train NLP model
    if ZV_BO_TRAIN_DATA:
        ZV_OB_SPACY_NLP_MODEL = FC_SPACY_TRAIN_NLP_MODEL(
            ZV_DF_TRAINDATA,
            ZV_OB_SPACY_NLP_MODEL,
            ZV_ST_SOURCES_FOLDER,
            ZV_NU_ITERATIONS
        )

    # 5/ Scoring using NLP model
    ZV_LI_LABELS = [
        _FC_GET_SPACY_LABEL(ZV_OB_SPACY_NLP_MODEL(ZV_ST_NAME))
        for ZV_ST_NAME in ZV_DF_SUPPLIERS.get_column('Company_name')
    ]

    # 6/ Add label to DataFrame
    ZV_DF_SUPPLIERS = (
        ZV_DF_SUPPLIERS
        .with_columns(
            PI_POLARS.Series(
                'ZF_ST_SPACY_LABEL',
                ZV_LI_LABELS
            )
        )
    )

    # 7/ Overwrite label, if the company name contains a company word
    ZV_DF_SUPPLIERS = (
        ZV_DF_SUPPLIERS
        # One row per space-separated word of the company name
        .with_columns(
            PI_POLARS.col('Company_name')
            .str.split(' ')
            .alias('ZF_LI_COMPANY_NAME_WORD')
        )
        .explode('ZF_LI_COMPANY_NAME_WORD')
        # coalesce=False keeps 'COMPANY_WORD' as its own column, so a
        # non-null value marks a word that matched a company word
        .join(
            ZV_DF_COMPANY_WORDS,
            left_on='ZF_LI_COMPANY_NAME_WORD',
            right_on='COMPANY_WORD',
            how='left',
            coalesce=False
        )
        # maintain_order=True keeps the groups in order of first appearance;
        # without it the row order of the exported result is arbitrary.
        # NOTE(review): grouping on 'Company_name' returns one row per
        # distinct name, so suppliers sharing a name are collapsed to the
        # first occurrence - confirm this is intended.
        .group_by('Company_name', maintain_order=True)
        .agg(
            PI_POLARS.all().first(),
            PI_POLARS.col('COMPANY_WORD')
            .is_not_null()
            .any()
            .alias('ZF_BO_HAS_COMPANY_WORD')
        )
        .with_columns(
            PI_POLARS.when(
                PI_POLARS.col('ZF_BO_HAS_COMPANY_WORD')
            )
            .then(
                PI_POLARS.lit('ORG')
            )
            .otherwise(
                PI_POLARS.col('ZF_ST_SPACY_LABEL')
            )
            .alias('ZF_ST_SPACY_LABEL')
        )
        # Remove the helper columns; strict=False ignores any that are absent
        .drop(
            [
                'ZF_LI_COMPANY_NAME_WORD',
                'COMPANY_WORD',
                'ZF_BO_HAS_COMPANY_WORD'
            ],
            strict=False
        )
    )

    # 8/ Export the results
    FC_EXPORT_EXCEL_POLARS(
        ZV_DF_SUPPLIERS,
        ZV_ST_RESULTS_FOLDER,
        ZV_ST_RESULTS_FILE
    )


if __name__ == '__main__':
    FC_P01_11_SUPPLIERS_ARE_PEOPLE()