Evaluation on test set gives an F1 score greater than 1

Hi there,

I am evaluating a GAT model on a separate test set using the metrics AUC and F1 score. The AUC looks reasonable, but the F1 score is always higher than 1, in this case 2.4. Could someone kindly help? Many thanks!!! :smiling_face_with_three_hearts:

def test_eval(args, n_classes):

    num_classes = n_classes
    window_size = args.window_size
    print('building graphs for testing')
    x_adj, x_feature = build_graph(start=len(traindf), end=len(traindf) + len(testdf), weighted_graph = True)
    test_label = testdf['target'].values   

    testdataset = GraphDataset(x_adj, x_feature,test_label)
    testloader = GraphDataLoader(testdataset, batch_size = args.batch_size, shuffle = False)
    
    model = GATClassifier(args.embedding_dim, args.hidden_dim, args.num_heads, num_classes)
    model_list = load_models(model, args.n_folds)
        
    test_f1 = 0
    test_auc = 0
    
    all_labels = []
    all_logits = []
    
    total = len(testloader) 
    model.eval() 
    
    with torch.no_grad():
        for idx, (G,label) in tqdm(enumerate(testloader),  total = total):
            h = G.ndata['feat'].float()
            logit = 0
            for mod in model_list:
                log = mod(G, h)
                logit += log.softmax(-1) / args.n_folds
    
                label_numpy = label.detach().cpu().numpy()
                logit_numpy = logit.softmax(-1).detach().cpu().numpy()

                test_f1 += sklearn.metrics.f1_score(label_numpy, logit_numpy.argmax(-1), average = 'micro')/total

                all_labels.append(label_numpy)
                all_logits.append(logit_numpy)

        all_labels = np.concatenate(all_labels)
        all_logits = np.concatenate(all_logits)
        
        test_auc = sklearn.metrics.roc_auc_score(all_labels, all_logits, multi_class = 'ovo', labels = np.array([int(i) for i in range(num_classes)]))

    return print("F1_score: ", test_f1, "AUC: ", test_auc)

It looks like you are evaluating multiple models at the same time, and the F1 computation sits inside the inner loop over the fold models, so the running total is updated once per model per batch instead of once per batch. Maybe first take a look at the F1 and AUC scores of each model separately? Something along the lines of the sketch below could help.
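For reference, here is a minimal sketch of how the ensemble evaluation could be restructured. It reuses the names from your snippet (model_list, testloader, num_classes are assumed to be set up exactly as in your code) and is untested against your data, so treat it as a starting point rather than a drop-in fix. The idea is to average the fold probabilities once per batch, avoid applying softmax a second time, and compute each metric a single time on the concatenated predictions, which keeps micro-F1 within [0, 1].

import numpy as np
import sklearn.metrics
import torch
from tqdm import tqdm

def eval_ensemble(model_list, testloader, num_classes):
    # Evaluate an ensemble of fold models and return (micro-F1, AUC).
    for mod in model_list:
        mod.eval()

    all_labels, all_probs = [], []

    with torch.no_grad():
        for G, label in tqdm(testloader, total=len(testloader)):
            h = G.ndata['feat'].float()

            # Average the softmax probabilities over the fold models.
            probs = 0
            for mod in model_list:
                probs = probs + mod(G, h).softmax(-1) / len(model_list)

            # probs are already probabilities -- no second softmax needed.
            all_labels.append(label.cpu().numpy())
            all_probs.append(probs.cpu().numpy())

    all_labels = np.concatenate(all_labels)
    all_probs = np.concatenate(all_probs)

    # Compute each metric exactly once, on the whole test set.
    test_f1 = sklearn.metrics.f1_score(all_labels, all_probs.argmax(-1), average='micro')
    test_auc = sklearn.metrics.roc_auc_score(
        all_labels, all_probs,
        multi_class='ovo', labels=np.arange(num_classes))
    return test_f1, test_auc

To check each fold model on its own, you can call the same function with a single-element list, e.g. f1, auc = eval_ensemble([model_list[0]], testloader, num_classes), and compare the scores across folds before looking at the ensemble.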
