from spacy.training import Example as PI_SPACY_TRAINING_EXAMPLE
import polars as PI_POLARS
import random as PI_RANDOM
import os as PI_OS


def FC_SPACY_TRAIN_NLP_MODEL(
    ZVFCI_DF_TRAINDATA,
    ZVFCI_OB_SPACY_NLP_MODEL,
    ZVFCI_ST_SOURCES_FOLDER,
    ZVFCI_NU_ITERATIONS,
):
    """Update the 'ner' pipe of a spaCy model and save the model to disk.

    Every row of the training data whose 'Trained' column is not 'x'
    (case-insensitive) becomes one training example: the text is the
    'Entity name' value and a single entity span covering the whole text
    is labelled with the 'Entity label' value.

    Args:
        ZVFCI_DF_TRAINDATA: polars DataFrame with the columns 'Trained',
            'Entity name' and 'Entity label'.
        ZVFCI_OB_SPACY_NLP_MODEL: spaCy pipeline object that contains a
            'ner' pipe. It is updated in place.
        ZVFCI_ST_SOURCES_FOLDER: folder in which the model is saved, in
            the sub-folder 'ner'.
        ZVFCI_NU_ITERATIONS: number of update passes over the training
            examples.

    Returns:
        The same model object that was passed in, after the updates.
    """
    # Model path
    ZV_ST_MODEL_PATH = PI_OS.path.join(ZVFCI_ST_SOURCES_FOLDER, 'ner')

    # Keep the rows that are not marked as trained. A null 'Trained' cell
    # makes the comparison null, and filter() discards null predicates, so
    # nulls are mapped to True to keep unmarked rows in the training set.
    ZV_EX_NOT_TRAINED = (
        (PI_POLARS.col('Trained').str.to_lowercase() != 'x')
        .fill_null(True)
    )
    ZVFCI_DF_TRAINDATA = ZVFCI_DF_TRAINDATA.filter(ZV_EX_NOT_TRAINED)

    # Create a list of training tuples (str, dict)
    ZV_LI_TU_TRAINING_DATA = []
    for ZV_DI_ROW in ZVFCI_DF_TRAINDATA.iter_rows(named=True):
        ZV_OB_NAME = ZV_DI_ROW['Entity name']
        ZV_OB_LABEL = ZV_DI_ROW['Entity label']

        # A missing cell would otherwise be trained as the text 'None'
        if ZV_OB_NAME is None or ZV_OB_LABEL is None:
            continue

        ZV_ST_NAME = str(ZV_OB_NAME)
        ZV_ST_LABEL = str(ZV_OB_LABEL)

        # An empty name would produce a zero-length entity span
        if not ZV_ST_NAME:
            continue

        ZV_LI_TU_TRAINING_DATA.append(
            (
                ZV_ST_NAME,
                {'entities': [(0, len(ZV_ST_NAME), ZV_ST_LABEL)]},
            )
        )

    # Distinct labels in first-seen order, so each one is added only once
    ZV_LI_LABELS = list(
        dict.fromkeys(
            ZV_ST_LABEL
            for _, ZV_DI_LABEL in ZV_LI_TU_TRAINING_DATA
            for _, _, ZV_ST_LABEL in ZV_DI_LABEL['entities']
        )
    )

    # Add labels to the ner pipe (the pipe is only looked up when needed)
    if ZV_LI_LABELS:
        ZV_OB_NER_PIPE = ZVFCI_OB_SPACY_NLP_MODEL.get_pipe('ner')
        for ZV_ST_LABEL in ZV_LI_LABELS:
            ZV_OB_NER_PIPE.add_label(ZV_ST_LABEL)

    # Create a list of all pipes except ner
    ZV_LI_OTHER_PIPES = [
        ZV_ST_PIPE
        for ZV_ST_PIPE in ZVFCI_OB_SPACY_NLP_MODEL.pipe_names
        if ZV_ST_PIPE != 'ner'
    ]

    # Disable all pipes except ner while updating. Every iteration shuffles
    # the training data and passes all examples to one update call.
    ZV_LI_DI_LOSSES = []
    with ZVFCI_OB_SPACY_NLP_MODEL.select_pipes(disable=ZV_LI_OTHER_PIPES):
        ZV_OB_OPTIMIZER = ZVFCI_OB_SPACY_NLP_MODEL.resume_training()

        for _ in range(ZVFCI_NU_ITERATIONS):
            PI_RANDOM.shuffle(ZV_LI_TU_TRAINING_DATA)
            ZV_DI_LOSSES = {}

            ZV_LI_EXAMPLES = [
                PI_SPACY_TRAINING_EXAMPLE.from_dict(
                    ZVFCI_OB_SPACY_NLP_MODEL.make_doc(ZV_ST_NAME),
                    ZV_DI_LABEL,
                )
                for ZV_ST_NAME, ZV_DI_LABEL in ZV_LI_TU_TRAINING_DATA
            ]

            ZVFCI_OB_SPACY_NLP_MODEL.update(
                ZV_LI_EXAMPLES,
                sgd=ZV_OB_OPTIMIZER,
                losses=ZV_DI_LOSSES,
            )
            ZV_LI_DI_LOSSES.append(ZV_DI_LOSSES)

    ZVFCI_OB_SPACY_NLP_MODEL.to_disk(ZV_ST_MODEL_PATH)

    print(
        'ner has been trained. \n'
        'Loss values that are low indicate that the model predicts '
        'correctly most of the time:\n'
    )

    # One line per iteration; 0 is shown when no 'ner' loss was recorded
    for ZV_NU_INDEX, ZV_DI_LOSS in enumerate(ZV_LI_DI_LOSSES, start=1):
        print(
            f"Iteration {ZV_NU_INDEX}: "
            f"{float(ZV_DI_LOSS.get('ner', 0))}"
        )

    return ZVFCI_OB_SPACY_NLP_MODEL