Skip to content

Commit 9d60892

Browse files
committed
grid_submit: robustness fixes / adaption to work with pipeline runner
1 parent 61ae62b commit 9d60892

File tree

1 file changed

+24
-17
lines changed

1 file changed

+24
-17
lines changed

GRID/utils/grid_submit.sh

Lines changed: 24 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ notify_mattermost() {
3131
set +x
3232
if [ "$MATTERMOSTHOOK" ]; then
3333
text=$1
34-
COMMAND="curl -X POST -H 'Content-type: application/json' --data '{\"text\":\""${text}"\"}' "${MATTERMOSTHOOK}
34+
COMMAND="curl -X POST -H 'Content-type: application/json' --data '{\"text\":\""${text}"\"}' "${MATTERMOSTHOOK}" &> /dev/null"
3535
eval "${COMMAND}"
3636
fi
3737
}
@@ -42,13 +42,13 @@ starthook() {
4242

4343
uploadlogs() {
4444
# MOMENTARILY WE ZIP ALL LOG FILES
45-
zip logs_PROCID${ALIEN_PROC_ID:-0}_failure.zip *.log* *mergerlog* *serverlog* *workerlog* alien_log_${ALIEN_PROC_ID}_failure.txt
45+
zip logs_PROCID${ALIEN_PROC_ID:-0}_failure.zip *.log* *mergerlog* *serverlog* *workerlog* alien_log_${ALIEN_PROC_ID:-0}_failure.txt
4646
[ "${ALIEN_JOB_OUTPUTDIR}" ] && upload_to_Alien logs_PROCID${ALIEN_PROC_ID:-0}_failure.zip ${ALIEN_JOB_OUTPUTDIR}/
4747
}
4848
export -f uploadlogs
4949
failhook() {
5050
notify_mattermost "${ALIEN_PROC_ID}: **Failure** in stage $2"
51-
cp alien_log_${ALIEN_PROC_ID}.txt logtmp_${ALIEN_PROC_ID}_failure.txt
51+
cp alien_log_${ALIEN_PROC_ID:-0}.txt logtmp_${ALIEN_PROC_ID:-0}_failure.txt
5252

5353
# MOMENTARILY WE ZIP ALL LOG FILES
5454
uploadlogs
@@ -121,11 +121,12 @@ checkpoint_hook_ttlbased() {
121121

122122
# analyse CPU utilization
123123
corecount=$(grep "processor" /proc/cpuinfo | wc -l)
124-
cpuusage=$(./analyse_CPU.py $2_cpuusage ${corecount})
124+
path=$PWD
125+
cpuusage=$(analyse_CPU.py $PWD/$2_cpuusage ${corecount} 2>/dev/null)
125126

126127
# analyse memory util
127-
maxmem=$(grep "PROCESS MAX MEM" $2 | awk '//{print $5}')
128-
avgmem=$(grep "PROCESS AVG MEM" $2 | awk '//{print $5}')
128+
maxmem=$(grep "PROCESS MAX MEM" ${path}/$2 | awk '//{print $5}')
129+
avgmem=$(grep "PROCESS AVG MEM" ${path}/$2 | awk '//{print $5}')
129130

130131
metrictext="#pdpmetric:${JOBLABEL},procid:${ALIEN_PROC_ID},CPU:${cpumodel},stage:$2,RC:${RC:-1},walltime:${walltime},${cpuusage},MAXMEM:${maxmem},AVGMEM:${avgmem}"
131132
notify_mattermost "${metrictext}"
@@ -174,10 +175,6 @@ export JOBUTILS_JOB_ENDHOOK=checkpoint_hook_ttlbased
174175
ONGRID=0
175176
[ "${JALIEN_TOKEN_CERT}" ] && ONGRID=1
176177

177-
# All is redirected to log.txt but kept on stdout as well
178-
if [[ $ALIEN_PROC_ID ]]; then
179-
exec &> >(tee -a alien_log_${ALIEN_PROC_ID}.txt)
180-
fi
181178

182179
JOBTTL=82000
183180
# this tells us to continue an existing job --> in this case we don't create a new workdir
@@ -334,6 +331,10 @@ if [[ "${ONGRID}" == 0 ]]; then
334331
cd "${WORKDIR}" 2> /dev/null
335332
fi
336333

334+
# All output (stdout and stderr) is appended to alien_log_<ALIEN_PROC_ID, or 0 if unset>.txt via tee, and is kept on stdout as well
335+
#if [[ $ALIEN_PROC_ID ]]; then
336+
exec &> >(tee -a alien_log_${ALIEN_PROC_ID:-0}.txt)
337+
#fi
337338

338339
# ----------- START JOB PREAMBLE -----------------------------
339340
banner "Environment"
@@ -398,11 +399,12 @@ if [ "${ONGRID}" = "1" ]; then
398399
fi
399400

400401
# ----------- DOWNLOAD ADDITIONAL HELPERS ----------------------------
401-
curl -o analyse_CPU.py https://raw.githubusercontent.com/sawenzel/AliceO2/swenzel/cpuana/Utilities/Tools/analyse_CPU.py
402+
curl -o analyse_CPU.py https://raw.githubusercontent.com/sawenzel/AliceO2/swenzel/cpuana/Utilities/Tools/analyse_CPU.py &> /dev/null
402403
chmod +x analyse_CPU.py
404+
export PATH=$PATH:$PWD
403405
export JOBUTILS_MONITORCPU=ON
404406
export JOBUTILS_WRAPPER_SLEEP=5
405-
export JOBUTILS_JOB_KILLINACTIVE=180 # kill inactive jobs after 3 minutes
407+
#export JOBUTILS_JOB_KILLINACTIVE=180 # kill inactive jobs after 3 minutes --> will be the task of pipeline runner? (or make it optional)
406408
export JOBUTILS_MONITORMEM=ON
407409

408410
# ----------- EXECUTE ACTUAL JOB ------------------------------------
@@ -411,11 +413,16 @@ chmod +x ./alien_jobscript.sh
411413
./alien_jobscript.sh
412414

413415
# just to be sure that we get the logs
414-
cp alien_log_${ALIEN_PROC_ID}.txt logtmp_${ALIEN_PROC_ID}.txt
415-
[ "${ALIEN_JOB_OUTPUTDIR}" ] && upload_to_Alien logtmp_${ALIEN_PROC_ID}.txt ${ALIEN_JOB_OUTPUTDIR}/
416-
417-
# MOMENTARILU WE ZIP ALL LOG FILES
418-
zip logs_PROCID${ALIEN_PROC_ID:-0}.zip *.log* *mergerlog* *serverlog* *workerlog* alien_log_${ALIEN_PROC_ID}.txt
416+
cp alien_log_${ALIEN_PROC_ID:-0}.txt logtmp_${ALIEN_PROC_ID:-0}.txt
417+
[ "${ALIEN_JOB_OUTPUTDIR}" ] && upload_to_Alien logtmp_${ALIEN_PROC_ID:-0}.txt ${ALIEN_JOB_OUTPUTDIR}/
418+
419+
# MOMENTARILY WE ZIP ALL LOG FILES
420+
ziparchive=logs_PROCID${ALIEN_PROC_ID:-0}.zip
421+
find ./ -name "*.log*" -exec zip ${ziparchive} {} ';'
422+
find ./ -name "*mergerlog*" -exec zip ${ziparchive} {} ';'
423+
find ./ -name "*serverlog*" -exec zip ${ziparchive} {} ';'
424+
find ./ -name "*workerlog*" -exec zip ${ziparchive} {} ';'
425+
find ./ -name "alien_log*.txt" -exec zip ${ziparchive} {} ';'
419426

420427
# We need to exit for the ALIEN JOB HANDLER!
421428
exit 0

0 commit comments

Comments
 (0)