Rectangular cut optimization for classification

#python #c #linux

Question:

My dataset contains "STAR" and "GALAXY" objects, and I want to classify them using rectangular cut optimization.

I am using ANNZ (https://github.com/IftachSadeh/ANNZ), but I have not been able to classify the data.

This is the code I am using:

 from scripts.helperFuncs import *

init()

if not glob.annz["doSingleCls"]:
    log.info(red(" - " + time.strftime("%d/%m/%y %H:%M:%S") + " - This script is only designed for singleClassification...")) ; sys.exit(0)

log.info(whtOnBlck(" - " + time.strftime("%d/%m/%y %H:%M:%S") + " - starting ANNZ"))

############################
### VARIABLES (CHANGE 1) ###
############################
# This is a list of variables that you may need to change when you run your code.
# Variable types: B:boolean, F:float, D:double, C:string, S:short, I:integer, L:long, US:unsigned S, UI:unsigned I, UL:unsigned L.

output_folder    = "sample_single_classification_RECTCUT_3var_star_galaxy_test2_pdf" # Output folder name. This should always be the same as the name of this file (for the annz2-clas.sh to work).
input_folder     = "travaltes"                # The folder where you put your training, validation and testing files
signal           = "type == 3"                # signal, or the "yes" value (use == sign)
background       = "type == 6"                # background, or the "no" value
train_filename   = "train_star.txt;train_galaxy.txt" # Training set file name (multiple files allowed)
valid_filename   = "validate_star.txt;validate_galaxy.txt" # Validation set file name
eval_filename    = "test_star.txt;test_galaxy.txt"     # Testing set file name
variables_train  = "UL:objID; F:ra; I:type; C:class; F:snMedian; F:p_el; F:p_cs; F:lnLExp_r; F:lnLDev_r; F:deVMag_r; F:expMag_r; F:modelMag_r; F:cModelMag_r; F:extinction_r; F:deVAB_r; F:dered_r; F:petroR50_i; F:petroR90_i; " # The list of variables in the training and validating set
variables_eval   = "UL:objID; F:ra; I:type; C:class; F:snMedian; F:p_el; F:p_cs; F:lnLExp_r; F:lnLDev_r; F:deVMag_r; F:expMag_r; F:modelMag_r; F:cModelMag_r; F:extinction_r; F:deVAB_r; F:dered_r; F:petroR50_i; F:petroR90_i;" # The list of variables in the testing set.
num_input    = 3
variables_input  = "snMedian; deVMag_r; expMag_r;" # List of input variables you use to train
variables_output = "type"

##############################
### MLM OPTIONS (CHANGE 2) ###
##############################
# Select/edit your MLM options. Comment out the ones you don't want to use.

## RECTANGULAR CUTS ##
mlm_options = "ANNZ_MLM=CUTS: VarTransform=P: FitMethod=GA: EffMethod=EffPDF: CutRangeMin=-1: CutRangeMax=-1: VarProp=FSmart"

## PROJECTIVE LIKELIHOOD ESTIMATOR ##
# mlm_options = "ANNZ_MLM=Likelihood: VarTransform=P: TransformOutput=False"

#########
## KNN ##
## ScaleFrac: does best between 0.1 and 0.9, definitely not 0 or 1.
## nkNN     : depends on size of training sample: = trainsize/5000 for MGS, trainsize/1000 for LRG.
## Kernel   : polynomial(Poln) or Gaussian(Gaus)
## Trim     : default as False
# mlm_options = "ANNZ_MLM=KNN: VarTransform=P: nkNN=0.2: ScaleFrac=0.9: Kernel=Poln"
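## Worked example of the nkNN rule of thumb above (numbers are illustrative only,
## not a recommendation): a hypothetical MGS-like training sample of 100000 objects
## would give nkNN ~ 100000/5000 = 20.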

############
## PDE-RS ##
## MaxIterations : 1 iteration for every 50 training objects;
## NEventsMin    = 4x^4 - 39x^3 + 128x^2 - 148x + 80, where x is the sample size/50000
## NEventsMax    = NEventsMin + 10
# mlm_options = "ANNZ_MLM=PDERS: VarTransform=P: NEventsMin=100: NEventsMax=200: MaxVIterations=500"
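## Worked example of the NEventsMin formula above (illustrative only): for a
## training sample of 50000 objects, x = 50000/50000 = 1, so
## NEventsMin = 4 - 39 + 128 - 148 + 80 = 25 and NEventsMax = 25 + 10 = 35.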

##############
## PDE-FOAM ##
# mlm_options = "ANNZ_MLM=PDEFoam: VarTransform=P: SigBgSeparate=False: TailCut=0.001: VolFrac=0.5: nActiveCells=500: nSampl=2000: nBin=5: Compress=True: MultiTargetRegression=False: Nmin=100: MaxDepth=0: FillFoamWithOrigWeights=False: UseYesNoCell=False: DTLogic=None: TargetSelection=Mean"

#########
## ANN ##
## My trusted machine learning method. There are several ways to select your number of hidden layers, replace h1-5 in the code.
# h1 = str(int(2*num_input)) + "," + str(int(2*num_input))                               # 2N:2N (ANNz1 default)
# h2 = str(int(round((2*num_input+1)/3,0))) + "," + str(int(round((num_input+2)/3,0)))   # (2N+1)/3:(N+2)/3 (John's default)
# h3 = str(int(round(1.5*num_input,0))) + "," + str(int(num_input))                      # 3N/2:N (ANNz2 default)
# h4 = str(int(num_input)) + "," + str(int(num_input))                                   # N:N
# h5 = str(int(2*num_input)) + "," + str(int(round(1.5*num_input,0)))                    # 2N:3N/2
# mlm_options = "ANNZ_MLM=ANN: VarTransform=D: TrainingMethod=BFGS: SamplingTraining=False: NeuronType=tanh: UseRegulator=False: HiddenLayers=" + h2 + ":RandomSeed=101979"
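## Worked example (illustrative only): with num_input = 3 as defined above, h2
## evaluates to str(int(round(7/3, 0))) + "," + str(int(round(5/3, 0))) == "2,2",
## i.e. two hidden layers of two neurons each.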

#########
## BDT ##
# mlm_options = "ANNZ_MLM=BDT: VarTransform=N,P: NTrees=500: BoostType=Bagging: BaggedSampleFraction=1.0: nCuts=20: MinNodeSize=0.02"


##################################################################################################################################


###########################
### PRE-PROCESSING DATA ###
###########################
glob.annz["outDirName"]   = output_folder
glob.annz["nMLMs"]        = 1
glob.annz["userCuts_sig"] = signal
glob.annz["userCuts_bck"] = background

if glob.annz["doGenInputTrees"]:
  glob.annz["inDirName"]    = input_folder
  glob.annz["inAsciiVars"]  = variables_train
  glob.annz["splitTypeTrain"] = train_filename
  glob.annz["splitTypeTest"]  = valid_filename

  runANNZ()


################
### TRAINING ###
################
if glob.annz["doTrain"]:
  for nMLMnow in range(glob.annz["nMLMs"]):
    glob.annz["nMLMnow"] = nMLMnow

    if glob.annz["trainIndex"] >= 0 and glob.annz["trainIndex"] != nMLMnow: continue

    glob.annz["rndOptTypes"]       = "BDT" # can be "ANN", "BDT" or "ANN_BDT"
    glob.annz["inputVariables"]    = variables_input

    glob.annz["userMLMopts"] = mlm_options

    runANNZ()


###################################
### OPTIMISATION AND EVALUATION ###
###################################
if glob.annz["doOptim"] or glob.annz["doEval"]:

  glob.annz["addOutputVars"] = variables_output # add variables to output file

  if glob.annz["doOptim"]:
    runANNZ()

  if glob.annz["doEval"]:
    glob.annz["inDirName"]      = input_folder
    glob.annz["inAsciiFiles"]   = eval_filename
    glob.annz["inAsciiVars"]    = variables_eval
    glob.annz["evalDirPostfix"] = ""

    runANNZ()

log.info(whtOnBlck(" - " + time.strftime("%d/%m/%y %H:%M:%S") + " - finished running ANNZ !"))
 

I have read about rectangular cut optimization and understand that this algorithm expects the input variables to be uncorrelated. It is also not a multivariate analyser, but rather a sequence of one-dimensional cuts. I have tried several different variables that are not correlated with each other, and it still does not work.
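For reference, the following is a minimal sketch (plain Python/pandas, not ANNZ code) for quantifying the linear correlations between the chosen inputs. It assumes the training files are comma-separated ASCII tables whose columns follow variables_train above; the separator and paths are assumptions to adjust to the actual layout.

import pandas as pd

# Column names copied from variables_train above.
cols = ["objID", "ra", "type", "class", "snMedian", "p_el", "p_cs",
        "lnLExp_r", "lnLDev_r", "deVMag_r", "expMag_r", "modelMag_r",
        "cModelMag_r", "extinction_r", "deVAB_r", "dered_r",
        "petroR50_i", "petroR90_i"]

# Assumed: comma-separated files with no header row; change sep/names if needed.
frames = [pd.read_csv(f, names=cols, sep=",", comment="#")
          for f in ["travaltes/train_star.txt", "travaltes/train_galaxy.txt"]]
train = pd.concat(frames, ignore_index=True)

# Pearson correlation matrix of the three training inputs; values close to +/-1
# indicate pairs that a sequence of one-dimensional cuts cannot exploit well.
print(train[["snMedian", "deVMag_r", "expMag_r"]].corr())

If, for example, deVMag_r and expMag_r turn out to be strongly correlated, replacing one of them, or relying on the PCA transformation already requested via VarTransform=P in mlm_options, would be a natural first step.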

I would appreciate it if someone could help me and point out where I went wrong.