@@ -31,7 +31,7 @@ notify_mattermost() {
3131 set +x
3232 if [ " $MATTERMOSTHOOK " ]; then
3333 text=$1
34- COMMAND=" curl -X POST -H 'Content-type: application/json' --data '{\" text\" :\" " ${text} " \" }' " ${MATTERMOSTHOOK}
34+ COMMAND=" curl -X POST -H 'Content-type: application/json' --data '{\" text\" :\" " ${text} " \" }' " ${MATTERMOSTHOOK} " &> /dev/null "
3535 eval " ${COMMAND} "
3636 fi
3737}
@@ -42,13 +42,13 @@ starthook() {
4242
4343uploadlogs () {
4444 # MOMENTARILY WE ZIP ALL LOG FILES
45- zip logs_PROCID${ALIEN_PROC_ID:- 0} _failure.zip * .log* * mergerlog* * serverlog* * workerlog* alien_log_${ALIEN_PROC_ID} _failure.txt
45+ zip logs_PROCID${ALIEN_PROC_ID:- 0} _failure.zip * .log* * mergerlog* * serverlog* * workerlog* alien_log_${ALIEN_PROC_ID:- 0 } _failure.txt
4646 [ " ${ALIEN_JOB_OUTPUTDIR} " ] && upload_to_Alien logs_PROCID${ALIEN_PROC_ID:- 0} _failure.zip ${ALIEN_JOB_OUTPUTDIR} /
4747}
4848export -f uploadlogs
4949failhook () {
5050 notify_mattermost " ${ALIEN_PROC_ID} : **Failure** in stage $2 "
51- cp alien_log_${ALIEN_PROC_ID} .txt logtmp_${ALIEN_PROC_ID} _failure.txt
51+ cp alien_log_${ALIEN_PROC_ID:- 0 } .txt logtmp_${ALIEN_PROC_ID:- 0 } _failure.txt
5252
5353 # MOMENTARILY WE ZIP ALL LOG FILES
5454 uploadlogs
@@ -121,11 +121,12 @@ checkpoint_hook_ttlbased() {
121121
122122 # analyse CPU utilization
123123 corecount=$( grep " processor" /proc/cpuinfo | wc -l)
124- cpuusage=$( ./analyse_CPU.py $2 _cpuusage ${corecount} )
124+ path=$PWD
125+ cpuusage=$( analyse_CPU.py $PWD /$2 _cpuusage ${corecount} 2> /dev/null)
125126
126127 # analyse memory util
127- maxmem=$( grep " PROCESS MAX MEM" $2 | awk ' //{print $5}' )
128- avgmem=$( grep " PROCESS AVG MEM" $2 | awk ' //{print $5}' )
128+ maxmem=$( grep " PROCESS MAX MEM" ${path} / $ 2 | awk ' //{print $5}' )
129+ avgmem=$( grep " PROCESS AVG MEM" ${path} / $ 2 | awk ' //{print $5}' )
129130
130131 metrictext=" #pdpmetric:${JOBLABEL} ,procid:${ALIEN_PROC_ID} ,CPU:${cpumodel} ,stage:$2 ,RC:${RC:- 1} ,walltime:${walltime} ,${cpuusage} ,MAXMEM:${maxmem} ,AVGMEM:${avgmem} "
131132 notify_mattermost " ${metrictext} "
@@ -174,10 +175,6 @@ export JOBUTILS_JOB_ENDHOOK=checkpoint_hook_ttlbased
174175ONGRID=0
175176[ " ${JALIEN_TOKEN_CERT} " ] && ONGRID=1
176177
177- # All is redirected to log.txt but kept on stdout as well
178- if [[ $ALIEN_PROC_ID ]]; then
179- exec & > >( tee -a alien_log_${ALIEN_PROC_ID} .txt)
180- fi
181178
182179JOBTTL=82000
183180# this tells us to continue an existing job --> in this case we don't create a new workdir
@@ -334,6 +331,10 @@ if [[ "${ONGRID}" == 0 ]]; then
334331 cd " ${WORKDIR} " 2> /dev/null
335332fi
336333
334+ # All is redirected to log.txt but kept on stdout as well
335+ # if [[ $ALIEN_PROC_ID ]]; then
336+ exec & > >( tee -a alien_log_${ALIEN_PROC_ID:- 0} .txt)
337+ # fi
337338
338339# ----------- START JOB PREAMBLE -----------------------------
339340banner " Environment"
@@ -398,11 +399,12 @@ if [ "${ONGRID}" = "1" ]; then
398399fi
399400
400401# ----------- DOWNLOAD ADDITIONAL HELPERS ----------------------------
401- curl -o analyse_CPU.py https://raw.githubusercontent.com/sawenzel/AliceO2/swenzel/cpuana/Utilities/Tools/analyse_CPU.py
402+ curl -o analyse_CPU.py https://raw.githubusercontent.com/sawenzel/AliceO2/swenzel/cpuana/Utilities/Tools/analyse_CPU.py & > /dev/null
402403chmod +x analyse_CPU.py
404+ export PATH=$PATH :$PWD
403405export JOBUTILS_MONITORCPU=ON
404406export JOBUTILS_WRAPPER_SLEEP=5
405- export JOBUTILS_JOB_KILLINACTIVE=180 # kill inactive jobs after 3 minutes
407+ # export JOBUTILS_JOB_KILLINACTIVE=180 # kill inactive jobs after 3 minutes --> will be the task of pipeline runner? (or make it optional)
406408export JOBUTILS_MONITORMEM=ON
407409
408410# ----------- EXECUTE ACTUAL JOB ------------------------------------
@@ -411,11 +413,16 @@ chmod +x ./alien_jobscript.sh
411413./alien_jobscript.sh
412414
413415# just to be sure that we get the logs
414- cp alien_log_${ALIEN_PROC_ID} .txt logtmp_${ALIEN_PROC_ID} .txt
415- [ " ${ALIEN_JOB_OUTPUTDIR} " ] && upload_to_Alien logtmp_${ALIEN_PROC_ID} .txt ${ALIEN_JOB_OUTPUTDIR} /
416-
417- # MOMENTARILU WE ZIP ALL LOG FILES
418- zip logs_PROCID${ALIEN_PROC_ID:- 0} .zip * .log* * mergerlog* * serverlog* * workerlog* alien_log_${ALIEN_PROC_ID} .txt
416+ cp alien_log_${ALIEN_PROC_ID:- 0} .txt logtmp_${ALIEN_PROC_ID:- 0} .txt
417+ [ " ${ALIEN_JOB_OUTPUTDIR} " ] && upload_to_Alien logtmp_${ALIEN_PROC_ID:- 0} .txt ${ALIEN_JOB_OUTPUTDIR} /
418+
419+ # MOMENTARILY WE ZIP ALL LOG FILES
420+ ziparchive=logs_PROCID${ALIEN_PROC_ID:- 0} .zip
421+ find ./ -name " *.log*" -exec zip ${ziparchive} {} ' ;'
422+ find ./ -name " *mergerlog*" -exec zip ${ziparchive} {} ' ;'
423+ find ./ -name " *serverlog*" -exec zip ${ziparchive} {} ' ;'
424+ find ./ -name " *workerlog*" -exec zip ${ziparchive} {} ' ;'
425+ find ./ -name " alien_log*.txt" -exec zip ${ziparchive} {} ' ;'
419426
420427# We need to exit for the ALIEN JOB HANDLER!
421428exit 0
0 commit comments