#python #c #linux
Question:
Classifying a data set containing "STAR" and "GALAXY" with rectangular cut optimisation
I am using ANNZ (https://github.com/IftachSadeh/ANNZ), but I have not been able to classify the data.
This is the code I am using:
from scripts.helperFuncs import *
init()
if not glob.annz["doSingleCls"]:
  log.info(red(" - " + time.strftime("%d/%m/%y %H:%M:%S") + " - This script is only designed for singleClassification...")) ; sys.exit(0)
log.info(whtOnBlck(" - " + time.strftime("%d/%m/%y %H:%M:%S") + " - starting ANNZ"))
############################
### VARIABLES (CHANGE 1) ###
############################
# This is a list of variables that you may need to change when you run your code.
# Variable types: B:boolean, F:float, D:double, C:string, S:short, I:integer, L:long, US:unsigned short, UI:unsigned int, UL:unsigned long.
output_folder = "sample_single_classification_RECTCUT_3var_star_galaxy_test2_pdf" # Output folder name. This should always be the same as the name of this file (for the annz2-clas.sh to work).
input_folder = "travaltes" # The folder where you put your training, validation and testing files
signal = "type == 3" # signal, or the "yes" value (use == sign)
background = "type == 6" # background, or the "no" value
train_filename = "train_star.txt;train_galaxy.txt" # Training set file name (multiple files allowed)
valid_filename = "validate_star.txt;validate_galaxy.txt" # Validation set file name
eval_filename = "test_star.txt;test_galaxy.txt" # Testing set file name
variables_train = "UL:objID; F:ra; I:type; C:class; F:snMedian; F:p_el; F:p_cs; F:lnLExp_r; F:lnLDev_r; F:deVMag_r; F:expMag_r; F:modelMag_r; F:cModelMag_r; F:extinction_r; F:deVAB_r; F:dered_r; F:petroR50_i; F:petroR90_i; " # The list of variables in the training and validating set
variables_eval = "UL:objID; F:ra; I:type; C:class; F:snMedian; F:p_el; F:p_cs; F:lnLExp_r; F:lnLDev_r; F:deVMag_r; F:expMag_r; F:modelMag_r; F:cModelMag_r; F:extinction_r; F:deVAB_r; F:dered_r; F:petroR50_i; F:petroR90_i;" # The list of variables in the testing set.
num_input = 3
variables_input = "snMedian; deVMag_r; expMag_r;" # List of input variables you use to train
variables_output = "type"
##############################
### MLM OPTIONS (CHANGE 2) ###
##############################
# Select/edit your MLM options. Comment out those that you don't want to use.
## RECTANGULAR CUTS ##
mlm_options = "ANNZ_MLM=CUTS: VarTransform=P: FitMethod=GA: EffMethod=EffPDF: CutRangeMin=-1: CutRangeMax=-1: VarProp=FSmart"
## PROJECTIVE LIKELIHOOD ESTIMATOR ##
# mlm_options = "ANNZ_MLM=Likelihood: VarTransform=P: TransformOutput=False"
#########
## KNN ##
## ScaleFrac: does best between 0.1 and 0.9, definitely not 0 or 1.
## nkNN : depends on size of training sample: = trainsize/5000 for MGS, trainsize/1000 for LRG.
## Kernel : polynomial(Poln) or Gaussian(Gaus)
## Trim : default is False
# mlm_options = "ANNZ_MLM=KNN: VarTransform=P: nkNN=0.2: ScaleFrac=0.9: Kernel=Poln"
############
## PDE-RS ##
## MaxIterations : 1 iteration for every 50 training objects;
## NEventsMin = 4x^4 - 39x^3 + 128x^2 - 148x + 80, where x is the sample size/50000
## NEventsMax = NEventsMin + 10
# mlm_options = "ANNZ_MLM=PDERS: VarTransform=P: NEventsMin=100: NEventsMax=200: MaxVIterations=500"
##############
## PDE-FOAM ##
# mlm_options = "ANNZ_MLM=PDEFoam: VarTransform=P: SigBgSeparate=False: TailCut=0.001: VolFrac=0.5: nActiveCells=500: nSampl=2000: nBin=5: Compress=True: MultiTargetRegression=False: Nmin=100: MaxDepth=0: FillFoamWithOrigWeights=False: UseYesNoCell=False: DTLogic=None: TargetSelection=Mean"
#########
## ANN ##
## My trusted machine learning method. There are several ways to select your number of hidden layers; replace h2 in the options string below with one of h1-h5.
# h1 = str(int(2*num_input)) + "," + str(int(2*num_input))                              # 2N:2N (ANNz1 default)
# h2 = str(int(round((2*num_input+1)/3,0))) + "," + str(int(round((num_input+2)/3,0)))  # (2N+1)/3:(N+2)/3 (John's default)
# h3 = str(int(round(1.5*num_input,0))) + "," + str(int(num_input))                     # 3N/2:N (ANNz2 default)
# h4 = str(int(num_input)) + "," + str(int(num_input))                                  # N:N
# h5 = str(int(2*num_input)) + "," + str(int(round(1.5*num_input,0)))                   # 2N:3N/2
# mlm_options = "ANNZ_MLM=ANN: VarTransform=D: TrainingMethod=BFGS: SamplingTraining=False: NeuronType=tanh: UseRegulator=False: HiddenLayers=" + h2 + ":RandomSeed=101979"
#########
## BDT ##
# mlm_options = "ANNZ_MLM=BDT: VarTransform=N,P: NTrees=500: BoostType=Bagging: BaggedSampleFraction=1.0: nCuts=20: MinNodeSize=0.02"
##################################################################################################################################
###########################
### PRE-PROCESSING DATA ###
###########################
glob.annz["outDirName"] = output_folder
glob.annz["nMLMs"] = 1
glob.annz["userCuts_sig"] = signal
glob.annz["userCuts_bck"] = background
if glob.annz["doGenInputTrees"]:
glob.annz["inDirName"] = input_folder
glob.annz["inAsciiVars"] = variables_train
glob.annz["splitTypeTrain"] = train_filename
glob.annz["splitTypeTest"] = valid_filename
runANNZ()
################
### TRAINING ###
################
if glob.annz["doTrain"]:
for nMLMnow in range(glob.annz["nMLMs"]):
glob.annz["nMLMnow"] = nMLMnow
if glob.annz["trainIndex"] >= 0 and glob.annz["trainIndex"] != nMLMnow: continue
glob.annz["rndOptTypes"] = "BDT" # can be "ANN", "BDT" or "ANN_BDT"
glob.annz["inputVariables"] = variables_input
glob.annz["userMLMopts"] = mlm_options
runANNZ()
###################################
### OPTIMISATION AND EVALUATION ###
###################################
if glob.annz["doOptim"] or glob.annz["doEval"]:
glob.annz["addOutputVars"] = variables_output # add variables to output file
if glob.annz["doOptim"]:
runANNZ()
if glob.annz["doEval"]:
glob.annz["inDirName"] = input_folder
glob.annz["inAsciiFiles"] = eval_filename
glob.annz["inAsciiVars"] = variables_eval
glob.annz["evalDirPostfix"] = ""
runANNZ()
log.info(whtOnBlck(" - " + time.strftime("%d/%m/%y %H:%M:%S") + " - finished running ANNZ!"))
I have read about rectangular cut optimisation and understand that this algorithm expects the input variables to be uncorrelated. It is also not a multivariate analyser, but rather a sequence of one-dimensional ones. I have tried several different variables that are not correlated with each other. It still does not work.
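For reference, here is a minimal sketch of the correlation check I have in mind (my own addition; it assumes the training files are comma-separated ASCII with the columns in the order given in variables_train, so the column indices for snMedian, deVMag_r and expMag_r are an assumption):
import numpy as np
# snMedian, deVMag_r, expMag_r are assumed to be columns 4, 9 and 10 (0-based),
# following the order of variables_train above -- adjust if the files differ.
cols = (4, 9, 10)
data = np.genfromtxt("travaltes/train_star.txt", delimiter=",", usecols=cols)
print(np.corrcoef(data, rowvar=False))  # small off-diagonal values => weakly correlated inputs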
I would appreciate it if anyone could help me and point out where I went wrong.