Commit a4afba6c authored by david.singh's avatar david.singh
Browse files

Add deployment script improvements.

parent 8b3c7736
Showing with 139 additions and 113 deletions
+139 -113
......@@ -106,8 +106,8 @@ int get_number_of_active_nodes()
else
{
number_active_storage_servers = atoi(buf);
fprintf(stderr, "[Wake up server] The new number of active data nodes is %s\n", buf);
slog_debug("[Wake up server] The new number of active data nodes is %s\n", buf);
fprintf(stderr, "[Server] The new number of active data nodes is %s\n", buf);
slog_debug("[Server] The new number of active data nodes is %s\n", buf);
}
// Close the file.
ret = close(fd);
......@@ -280,7 +280,7 @@ void handle_signal_server(int signal)
char buf[10], action[20];
;
// Get the operation number.
int fd = open("/tmp/hercules_pkill_operation", O_RDONLY);
int fd = open("./tmp/hercules_pkill_operation", O_RDONLY);
if (fd == -1)
{
perror("ERR_HERCULES_OPEN_PKILL_OPERATION");
......@@ -347,7 +347,7 @@ void handle_signal_server(int signal)
// This file is read by the hercules script to know whether this server
// was shut down correctly.
char tmp_file_path[100];
sprintf(tmp_file_path, "/tmp/%c-hercules-%d-%s", args.type, args.id, action);
sprintf(tmp_file_path, "%s/tmp/%c-hercules-%d-%s", args.hercules_path, args.type, args.id, action);
ready(tmp_file_path, "OK");
}
if (signal == SIGUSR2) // wake up this server.
......@@ -363,7 +363,8 @@ void handle_signal_server(int signal)
// This file is read by the hercules script to know whether this server
// woke up correctly.
char tmp_file_path[100];
sprintf(tmp_file_path, "/tmp/%c-hercules-%d-up", args.type, args.id);
sprintf(tmp_file_path, "%s/tmp/%c-hercules-%d-up", args.hercules_path, args.type, args.id);
fprintf(stderr, "Writting file %s\n", tmp_file_path);
ready(tmp_file_path, "OK");
}
}
......@@ -451,7 +452,6 @@ int32_t main(int32_t argc, char **argv)
return 0;
}
sprintf(tmp_file_path, "/tmp/%c-hercules-%d-start", args.type, args.id);
cfg = cfg_init();
conf_path = getenv("HERCULES_CONF");
......@@ -515,6 +515,12 @@ int32_t main(int32_t argc, char **argv)
// fprintf(stderr, "Configuration file loaded: %s\n", conf_path);
}
if (getenv("HERCULES_PATH") != NULL)
strcpy(args.hercules_path, getenv("HERCULES_PATH"));
else if (cfg_get(cfg, "HERCULES_PATH"))
strcpy(args.hercules_path, cfg_get(cfg, "HERCULES_PATH"));
if (cfg_get(cfg, "URI"))
{
aux = cfg_get(cfg, "URI");
......@@ -624,6 +630,7 @@ int32_t main(int32_t argc, char **argv)
}
}
// IMSS_DEBUG_LEVEL = SLOG_NONE;
sprintf(tmp_file_path, "%s/tmp/%c-hercules-%d-start", args.hercules_path, args.type, args.id);
/***************************************************************/
/******************** PARSE INPUT ARGUMENTS ********************/
......@@ -640,7 +647,6 @@ int32_t main(int32_t argc, char **argv)
slog_debug("Server type=%c\n", args.type);
struct tm tm = *localtime(&t);
sprintf(log_path, "./%c-server-%d.%02d-%02d-%02d", args.type, args.id, tm.tm_hour, tm.tm_min, tm.tm_sec);
// sprintf(log_path, "./%c-server", args.type);
slog_init(log_path, IMSS_DEBUG_LEVEL, IMSS_DEBUG_FILE, IMSS_DEBUG_SCREEN, 1, 1, 1, args.id);
if (IMSS_DEBUG_FILE > 0)
......@@ -1202,7 +1208,7 @@ int32_t main(int32_t argc, char **argv)
// sleep(1);
// char tmp_file_path[100];
// sprintf(tmp_file_path, "/tmp/%c-hercules-%d-down", args.type, args.id);
// sprintf(tmp_file_path, "./tmp/%c-hercules-%d-down", args.type, args.id);
// stop_server();
// move_blocks_2_server(args.stat_port, args.id, imss_uri, g_map);
......@@ -1226,7 +1232,7 @@ int32_t main(int32_t argc, char **argv)
// ep_close(ucp_worker, pub_ep, UCP_EP_CLOSE_MODE_FORCE);
// ep_close(ucp_worker, client_ep, UCP_EP_CLOSE_MODE_FORCE);
sprintf(tmp_file_path, "/tmp/%c-hercules-%d-stop", args.type, args.id);
sprintf(tmp_file_path, "%s/tmp/%c-hercules-%d-stop", args.hercules_path, args.type, args.id);
ready(tmp_file_path, "OK");
// Free the memory buffer.
......
......@@ -8,7 +8,7 @@ BLOCK_SIZE = 512
MOUNT_POINT = /mnt/hercules/
# Path where the Hercules project is located
HERCULES_PATH = /beegfs/home/javier.garciablas/hercules
HERCULES_PATH = /beegfs/home/david.singh/hercules
# Port listening in the metadata node service
METADATA_PORT = 75000
......@@ -35,9 +35,10 @@ ATTACHED = 0
MALLEABILITY = 0
UPPER_BOUND_MALLEABILITY = 0
LOWER_BOUND_MALLEABILITY = 0
INIT_NUM_DATA_SERVERS = 1
# File containing a list of nodes serving as data nodes
DATA_HOSTFILE = data_hostfile
DATA_HOSTFILE = /beegfs/home/david.singh/EpiGraphFlexMPI/data_hostfile
# Number of threads attending data requests
THREAD_POOL = 1
......@@ -46,7 +47,7 @@ THREAD_POOL = 1
STORAGE_SIZE = 1 # No limit
# File containing a list of nodes serving as metadata nodes
METADATA_HOSTFILE = meta_hostfile
METADATA_HOSTFILE = /beegfs/home/david.singh/EpiGraphFlexMPI/meta_hostfile
# Replication factor (1, 2 or 3)
REPL_FACTOR = 1
......
......@@ -36,6 +36,7 @@ struct arguments
uint64_t port; /* port arg to '-p' */
int64_t bufsize; /* buffer size arg to '-b' */
char imss_uri[32]; /* IMSS URI arg to '-i' */
char hercules_path[PATH_MAX]; /* hercules path */
char * stat_host; /* Metadata server hostname arg to '-H' */
int64_t stat_port; /* Metadata server port number arg to '-P' */
int64_t num_servers; /* number of data servers arg to '-n' */
......
......@@ -3,13 +3,13 @@
SERVER_TYPE=$1
SERVER_NUMBER=$2
ACTION=$3 # expected string action, e.g., down when servers are stopped.
ATTEMPS=60
ATTEMPS=10
i=1
FILE="/tmp/$SERVER_TYPE-hercules-$SERVER_NUMBER-$ACTION"
FILE="./tmp/$SERVER_TYPE-hercules-$SERVER_NUMBER-$ACTION"
## Checks if the file exists.
until [ -f $FILE ]; do
# echo "Waiting for $FILE, attemp $i"
echo "Waiting for $FILE, attemp $i"
i=$(($i + 1))
## Waits at most $ATTEMPS attempts; after that an error is returned.
if [ $i -gt $ATTEMPS ]; then
......@@ -23,8 +23,12 @@ until [ -f $FILE ]; do
done
## Checks if the server was deployed correctly.
STATUS=$(cat $FILE | grep "STATUS" | awk '{print $3}')
STATUS=$(cat -- $FILE | grep "STATUS" | awk '{print $3}')
echo "STATUS=$STATUS"
## Removes the file.
set -x
rm ${FILE}
set +x
if [ "$STATUS" != "OK" ]; then
# echo "[X] Error deploying server $SERVER_NUMBER."
exit 1
......
#!/bin/bash
## Last modification: 06/09/2024
## Last modification: 18/11/2024
## Genaro Sanchez-Gallegos
# set -x
#set -x
#######################################################
#######################################################
## To stop servers.
StopServers() {
NAME=$1 # Server type ("metadata" or "data").
hosts=$2 # List of hostnames.
shift
# Operation to be applied to the server.
# 0 indicates the server will be killed, and
# 1 indicates the server will be suspended
# (e.g., Hercules shrink when malleability is enabled).
OPERATION=$3
OPERATION=$1
shift
hosts=("$@") # List of hostnames.
echo "# Hercules: Stopping $NAME servers in ${hosts[@]}"
if [[ "$VERBOSE" -eq "1" ]]; then
......@@ -22,7 +24,8 @@ StopServers() {
for node in "${hosts[@]}"
do
# Set the action to be performed by the servers when they receive the pkill signal.
( ssh $node "echo $OPERATION > /tmp/hercules_pkill_operation" )
# ( ssh $node "echo $OPERATION > ./tmp/hercules_pkill_operation" )
echo $OPERATION > "${HERCULES_PATH}/tmp/hercules_pkill_operation"
# Kill threads and finish the server.
( ssh $node "pkill -SIGUSR1 hercules_server" )
done
......@@ -30,7 +33,9 @@ StopServers() {
WakeUpServers() {
NAME=$1 # Server type ("meta" or "data").
hosts=$2 # List of hostnames.
shift
hosts=("$@") # List of hostnames.
# hosts=$2 # List of hostnames.
# Wake up desired servers.
for node in "${hosts[@]}"
......@@ -38,8 +43,9 @@ WakeUpServers() {
### echo "ssh $node pkill hercules_server"
( ssh $node "pkill -SIGUSR2 hercules_server" )
done
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' data_hostfile data2start_hostfile > data2up_index
set -x
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' ${HERCULES_DATA_HOSTFILE} ${HERCULES_DATA_HOSTFILE_PROV} > ${HERCULES_PATH}/tmp/data2up_index_$SLURM_JOB_ID
set +x
WaitForServers "data" "d" "up" ${hosts[@]}
}
......@@ -60,70 +66,44 @@ WaitForServers() {
shift
hosts=("$@")
file_name="data2${ACTION}_index"
readarray -t server_index_arr < $file_name #data2start_index
file_name="${HERCULES_PATH}/tmp/data2${ACTION}_index_$SLURM_JOB_ID"
readarray -t server_index_arr < $file_name #data2start_index-<SLURM-JOD-ID>
# echo "hostnames=${hosts[@]}"
ATTEMPS=1000
for node in "${hosts[@]}"
do
# FILE="/tmp/$SERVER_TYPE-hercules-$((server_index_arr[$SERVER_ID]-1))-$ACTION"
SERVER_NUMBER=$((server_index_arr[$SERVER_ID]-1))
if [[ "$VERBOSE" -eq "1" ]]; then
echo "# Hercules: $ACTION $SERVER_NAME server $SERVER_NUMBER on $node"
fi
COMMAND="$HERCULES_BASH_PATH/check-servers.sh $SERVER_TYPE $SERVER_NUMBER $ACTION"
### echo "[+] Running comprobation in $node... $COMMAND"
# if slurm is not available.
if [[ "$SLURM" -eq "0" ]]; then
if [ -z "$(ssh-keygen -F $node)" ]; then
ssh-keyscan -H $node >> ~/.ssh/known_hosts
fi
if [[ "$VERBOSE" -eq "1" ]]; then
echo "Running ssh $node $COMMAND"
echo "Running ssh $node -- $COMMAND"
fi
ret=$(ssh $node "$COMMAND")
ret=$(ssh $node -- "$COMMAND")
if [[ "$VERBOSE" -eq "1" ]]; then
echo "Return code $ret"
fi
else # if slurm is available.
ret=$(srun -N 1 -n 1 -c 1 -m block:block:block --mem=1G -w $node $COMMAND)
set -x
ret=$(srun -N 1 -n 1 -c 1 -m block:block:block --mem=1G -w $node -- $COMMAND)
set +x
fi
ret=$?
if [ "$ret" -gt 0 ]; then
echo "[Error: $ret] It has not been possible to "${ACTION}" a $SERVER_NAME server on $node, please verify the configuration file and logs."
#StopServers $SERVER_NAME $hosts 1
exit 1
fi
## Checks if the file exists.
## until [ -f $FILE ]; do
# ret=0
# until [ "$ret" -gt 0 ]; do
# # echo "Waiting for $FILE, attemp $i"
# i=$(($i + 1))
# ## Waits "attemps" times, then an error is return.
# if [ $i -gt $ATTEMPS ]; then
# exit 1
# fi
# t=$(($i % 5))
# if [ $t -eq 0 ]; then
# echo "[+][$HOSTNAME] Waiting for server $((server_index_arr[$SERVER_ID]-1)), $FILE"
# fi
# sleep 1
# done
# ## Checks if the server was deploy correctly.
# STATUS=$(cat $FILE | grep "STATUS" | awk '{print $3}')
# if [ "$STATUS" != "OK" ]; then
# # echo "[X] Error deploying server $SERVER_NUMBER."
# exit 1
# fi
SERVER_ID=$((SERVER_ID+1))
if [[ "$VERBOSE" -eq "1" ]]; then
echo "[OK] $SERVER_NAME $i server running in $node"
fi
fi
done
}
......@@ -246,7 +226,7 @@ fi
if [[ "$VERBOSE" -eq "1" ]]; then
echo "Reading configuration from $FILE"
fi
export HERCULES_CONF="$FILE"
#export HERCULES_CONF="$FILE"
## Checks if a configuration file was set.
......@@ -293,11 +273,6 @@ if [[ "$SLURM" -eq "0" && ! $STATUS = "stop" ]]; then
# SERVERS_HOSTFILES_OPTION=1
#SERVERS_HOSTFILES_OPTION=$(($SERVERS_HOSTFILES_OPTION + 1))
fi
# if a server hostfile is missing.
#if [ ! "$SERVERS_HOSTFILES_OPTION" -eq "2" ]; then
# echo "One or more hostfiles missing for the servers, please specify them with: -m <metadata_hostfile> -d <data_hostfile>"
# exit 0
#fi
fi
......@@ -325,7 +300,6 @@ if [[ "$VERBOSE" -eq "1" ]]; then
echo "Hercules path - $HERCULES_PATH"
fi
## Read configuration file.
export HERCULES_MOUNT_POINT=$(cat $FILE | grep "\<MOUNT_POINT\>" | head -1 | awk '{print $3}')
META_PORT=$(cat $FILE | grep "\<METADATA_PORT\>" | awk '{print $3}')
......@@ -347,8 +321,20 @@ if [ -z "$INIT_HERCULES_NUM_DATA" ] || [ "$INIT_HERCULES_NUM_DATA" = "0" ] ; the
echo "INIT_HERCULES_NUM_DATA = $INIT_HERCULES_NUM_DATA"
fi
# set -x
if [[ "$SLURM" -eq "1" ]]; then
## If slurm is enabled, we concat the job id to the configuration file to create a new one.
CURR_HERCULES_CONF_FILE="${FILE}_${SLURM_JOB_ID}"
cp $FILE $CURR_HERCULES_CONF_FILE
else
CURR_HERCULES_CONF_FILE="${FILE}"
fi
export HERCULES_CONF=$CURR_HERCULES_CONF_FILE
# exit 0
# echo "+ + + Initial number of data nodes is $INIT_HERCULES_NUM_DATA/$HERCULES_NUM_DATA"
export HERCULES_INIT_NUM_DATA=$INIT_HERCULES_NUM_DATA
export "HERCULES_INIT_NUM_DATA=${INIT_HERCULES_NUM_DATA}"
## \< \> to match exact word.
......@@ -374,19 +360,19 @@ then
## the hercules "remove" option to stop only some
## data servers. "stop" option will stop all running
## data servers.
cat $HERCULES_DATA_HOSTFILE > data2stop_hostfile
cat $HERCULES_DATA_HOSTFILE > "data2stop_hostfile_$SLURM_JOB_ID"
## Creates an array with the data servers hostnames.
readarray -t hosts < $HERCULES_DATA_HOSTFILE
## Stop the data servers.
StopServers "data" ${hosts[@]} 1
StopServers "data" 1 ${hosts[@]}
## Checks if user wants to wait until all servers are stopped.
# 1 indicates that this function should check if
# all servers have stopped correctly (synchronous).
# 0 in other case (asynchronous).
if [[ "$WAIT_SERVERS" -eq "1" ]]; then
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' data_hostfile data_hostfile > "data2${STATUS}_index"
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' ${HERCULES_DATA_HOSTFILE} ${HERCULES_DATA_HOSTFILE} > "${HERCULES_PATH}/tmp/data2${STATUS}_index_$SLURM_JOB_ID"
WaitForServers "data" "d" "${STATUS}" ${hosts[@]}
else
echo "No waiting for data servers to be stopped."
......@@ -398,17 +384,17 @@ then
exit 0
fi
# Stop metadata servers.
cat $HERCULES_METADATA_HOSTFILE > data2stop_hostfile
cat $HERCULES_METADATA_HOSTFILE > "data2stop_hostfile_$SLURM_JOB_ID"
readarray -t hosts < $HERCULES_METADATA_HOSTFILE
# echo "meta hosts=$hosts"
StopServers "metadata" ${hosts[@]} 1
StopServers "metadata" 1 ${hosts[@]}
## Checks if user wants to wait until all servers are stopped.
# 1 indicates that this function should check if
# all servers have stopped correctly (synchronous).
# 0 in other case (asynchronous).
if [[ "$WAIT_SERVERS" -eq "1" ]]; then
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' meta_hostfile meta_hostfile > "data2${STATUS}_index"
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' ${HERCULES_METADATA_HOSTFILE} ${HERCULES_METADATA_HOSTFILE} > "${HERCULES_PATH}/tmp/data2${STATUS}_index"
WaitForServers "metadata" "m" "${STATUS}" ${hosts[@]}
else
echo "No waiting for data servers to be stopped."
......@@ -431,14 +417,14 @@ then
echo "Data server file " $DATA_SERVER_FILE " does not exists"
exit 0
fi
HERCULES_DATA_HOSTFILE=$DATA_SERVER_FILE
HERCULES_DATA_HOSTFILE_PROV=$DATA_SERVER_FILE
else
echo "No data server file was provided to add more servers."
exit 0
fi
readarray -t hosts < $HERCULES_DATA_HOSTFILE
readarray -t hosts < $HERCULES_DATA_HOSTFILE_PROV
# echo "data hosts=$hosts"
WakeUpServers "data" $hosts
WakeUpServers "data" ${hosts[@]}
# sleep 10
exit 0
fi
......@@ -466,11 +452,12 @@ then
readarray -t hosts < $HERCULES_DATA_HOSTFILE
# 1 = Stops some servers synchronously.
# 0 = Do not kill the process.
StopServers "data" ${hosts[@]} 0
StopServers "data" 0 ${hosts[@]}
# For "remove" operations we always wait for the selected
# servers to be removed.
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' data_hostfile data_hostfile > "data2${STATUS}_index"
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' $HERCULES_DATA_HOSTFILE $HERCULES_DATA_HOSTFILE > "${HERCULES_PATH}/tmp/data2${STATUS}_index_$SLURM_JOB_ID"
WaitForServers "data" "d" "${STATUS}" ${hosts[@]}
# Metadata server is not stopped.
......@@ -478,6 +465,37 @@ then
exit 0
fi
## If not empty, the "METADATA_HOSTFILE" and "DATA_HOSTFILE" fields were set in the configuration file.
if [ ! -z "$HERCULES_METADATA_HOSTFILE" ]; then
if [[ "$SLURM" -eq "1" ]]; then
# If slurm is enabled, we added the slurm job id to the meta hostfile.
HERCULES_METADATA_HOSTFILE+="_"$SLURM_JOB_ID
## Replace all "/" with "\/" because "/" is a special character for the sed command.
ESCAPED_HERCULES_METADATA_HOSTFILE="${HERCULES_METADATA_HOSTFILE//\//\\/}"
## Replace the old metadata hostfile name with the new one containing the job id.
sed -i "s/^METADATA_HOSTFILE = .*/METADATA_HOSTFILE = $ESCAPED_HERCULES_METADATA_HOSTFILE/g" "$CURR_HERCULES_CONF_FILE"
fi
fi
if [ ! -z "$HERCULES_DATA_HOSTFILE" ]; then
if [[ "$SLURM" -eq "1" ]]; then
# If slurm is enabled, we added the slurm job id to the data hostfile.
HERCULES_DATA_HOSTFILE+="_"$SLURM_JOB_ID
## Replace all "/" with "\/" because "/" is a special character for the sed command.
ESCAPED_HERCULES_DATA_HOSTFILE="${HERCULES_DATA_HOSTFILE//\//\\/}"
## Replace the old data hostfile name with the new one containing the job id.
sed -i "s/^DATA_HOSTFILE = .*/DATA_HOSTFILE = $ESCAPED_HERCULES_DATA_HOSTFILE/g" "$CURR_HERCULES_CONF_FILE"
fi
fi
## Creates the initial "hercules_num_act_nodes" file.
echo $HERCULES_INIT_NUM_DATA > hercules_num_act_nodes
## Set the name of the hostfile that will list the allocated nodes.
if [[ "$SLURM" -eq "1" ]]; then
FULL_HOSTFILE="hostfile_$SLURM_JOB_ID"
else
FULL_HOSTFILE="hostfile"
fi
## If hostfiles was not set, then we create a hostfile containing the allocated nodes.
if [[ "$VERBOSE" -eq "1" ]]; then
......@@ -485,7 +503,7 @@ if [[ "$VERBOSE" -eq "1" ]]; then
fi
if [[ $SERVERS_HOSTFILES_OPTION -eq "0" ]]; then
#srun -pernode hostname |sort > hostfile
scontrol show hostnames "$SLURM_JOB_NODELIST" > hostfile
scontrol show hostnames "$SLURM_JOB_NODELIST" > $FULL_HOSTFILE
# scontrol show hostnames "$SLURM_JOB_NODELIST"
fi
......@@ -501,11 +519,11 @@ if [ -z "$HERCULES_MPI_HOSTFILE_NAME" ]; then
fi
## Creates a client hostfile using some of the allocated nodes.
if [[ $SLURM -eq "1" ]]; then
HERCULES_MPI_HOSTFILE_NAME="client_hostfile"
HERCULES_MPI_HOSTFILE_NAME="client_hostfile_"$SLURM_JOB_ID
if [[ $ATTACHED -eq "1" ]]; then
tail -n +$((HERCULES_NUM_METADATA+1)) hostfile | head -n $NUM_NODES_FOR_CLIENTS > $HERCULES_MPI_HOSTFILE_NAME
tail -n +$((HERCULES_NUM_METADATA+1)) $FULL_HOSTFILE | head -n $NUM_NODES_FOR_CLIENTS > $HERCULES_MPI_HOSTFILE_NAME
else
tail -n +$((HERCULES_NUM_METADATA+HERCULES_NUM_DATA+1)) hostfile | head -n $NUM_NODES_FOR_CLIENTS > $HERCULES_MPI_HOSTFILE_NAME
tail -n +$((HERCULES_NUM_METADATA+HERCULES_NUM_DATA+1)) $FULL_HOSTFILE | head -n $NUM_NODES_FOR_CLIENTS > $HERCULES_MPI_HOSTFILE_NAME
fi
fi
fi
......@@ -525,6 +543,7 @@ then
echo "[Error] Metadata server file not specified, please set one using -m <filename> flag."
exit 1
fi
else
if [[ "$VERBOSE" -eq "1" ]]; then
echo "[+] Metadata server file not specified, getting information from slurm."
......@@ -533,7 +552,7 @@ then
## then we create a file which contains the hostnames of the nodes that
## will be used to deploy the determinate set of metadata servers.
#readarray -t meta_hosts < <(head -n $HERCULES_NUM_METADATA hostfile)
head -n "$HERCULES_NUM_METADATA" hostfile > "$HERCULES_METADATA_HOSTFILE"
head -n "$HERCULES_NUM_METADATA" $FULL_HOSTFILE > "$HERCULES_METADATA_HOSTFILE"
fi
## To create an array with the meta nodes.
#printf "%s\n" ${meta_hosts[@]} > "$META_SERVER_FILE"
......@@ -552,7 +571,7 @@ echo "[+] Hercules: Starting metadata servers on ${meta_hosts[@]}"
start=`date +%s.%N`
for node in ${meta_hosts[@]}
do
RM="rm /tmp/m-hercules-$i"
RM="rm ${HERCULES_PATH}/tmp/m-hercules-$i"
COMMAND="$HERCULES_BUILD_PATH/hercules_server m $i"
## If slurm is not being used, we deploy the service by connecting
## to the node via ssh.
......@@ -582,7 +601,7 @@ done
## Wait until all metadata servers are up.
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' meta_hostfile meta_hostfile > data2start_index
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' "$HERCULES_METADATA_HOSTFILE" "$HERCULES_METADATA_HOSTFILE" > "${HERCULES_PATH}/tmp/data2start_index_$SLURM_JOB_ID"
WaitForServers "metadata" "m" "start" ${meta_hosts[@]}
end=`date +%s.%N`
runtime=$( echo "$end - $start" | bc -l )
......@@ -611,7 +630,7 @@ then
## then we create a file which contains the hostnames of the nodes that
## will be used to deploy the determinate set of data servers.
# readarray -t data_hosts < <(tail -n +$((HERCULES_NUM_METADATA+1)) hostfile | head -n $HERCULES_NUM_DATA)
tail -n +$((HERCULES_NUM_METADATA+1)) hostfile | head -n "$HERCULES_NUM_DATA" > "$HERCULES_DATA_HOSTFILE"
tail -n +$((HERCULES_NUM_METADATA+1)) $FULL_HOSTFILE | head -n "$HERCULES_NUM_DATA" > "$HERCULES_DATA_HOSTFILE"
fi
## To create an array with the data nodes.
# printf "%s\n" ${data_hosts[@]} > "$DATA_SERVER_FILE"
......@@ -631,14 +650,7 @@ fi
start=`date +%s.%N`
for node in ${data_hosts[@]}
do
# if [ "$i" -lt "$INIT_HERCULES_NUM_DATA" ]; then ## server is started and set as online.
# server_init_status=1
# else ## server is started but set as offline.
# server_init_status=0
# fi
# echo "[+] Running data server $i in $node..."
RM="rm /tmp/d-hercules-$i"
RM="rm ${HERCULES_PATH}/tmp/d-hercules-$i"
COMMAND="$HERCULES_BUILD_PATH/hercules_server d $i ${meta_hosts[0]} $INIT_HERCULES_NUM_DATA"
if [[ "$SLURM" -eq "0" ]]; then
# ssh $node "$RM; cd $HERCULES_BASH_PATH && $COMMAND &"
......@@ -667,7 +679,7 @@ do
done
## Wait until all data servers are up.
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' data_hostfile data_hostfile > data2start_index
awk 'FNR==NR{a[$0]=NR;next}{print a[$0]}' $HERCULES_DATA_HOSTFILE $HERCULES_DATA_HOSTFILE > "${HERCULES_PATH}/tmp/data2start_index_$SLURM_JOB_ID"
WaitForServers "data" "d" "start" ${data_hosts[@]}
end=`date +%s.%N`
runtime=$( echo "$end - $start" | bc -l )
......@@ -706,10 +718,12 @@ case $MPI_DS in
;;
esac
# set enviroment variables.
export HERCULES_NCPN=$NUM_CLIENTS_PER_NODE
export HERCULES_NNFC=$NUM_NODES_FOR_CLIENTS
## set environment variables.
# mpi options.
export "HERCULES_NCPN=${NUM_CLIENTS_PER_NODE}"
export "HERCULES_NNFC=${NUM_NODES_FOR_CLIENTS}"
# data hostfile name.
export "HERCULES_DATA_HOSTFILE=${HERCULES_DATA_HOSTFILE}"
unset META_PORT
unset DATA_PORT
......@@ -731,7 +745,7 @@ unset STORAGE_SIZE
echo -e "#############################################################################
[!] 1. To export the path of the configuration file, run the following command:
${GREEN}
export HERCULES_CONF=$FILE
export HERCULES_CONF=$CURR_HERCULES_CONF_FILE
${NC}
#############################################################################"
......@@ -752,7 +766,7 @@ unset LD_PRELOAD=$HERCULES_POSIX_PRELOAD
echo -e "#############################################################################
[!] To stop the services:
${RED}
hercules stop -f $FILE
hercules stop -f $CURR_HERCULES_CONF_FILE
${NC}
#############################################################################"
......@@ -761,9 +775,4 @@ hercules stop -f $FILE
${YELLOW}
export UCX_POSIX_USE_PROC_LINK=n
${NC}
#############################################################################"
# fi
# export LD_PRELOAD=$HERCULES_PATH/build/tools/libhercules_posix.so
#############################################################################"
\ No newline at end of file
......@@ -2256,7 +2256,7 @@ int32_t get_data_location(int32_t dataset_id, int32_t data_id, int32_t op_type)
// char *curr_num_data_nodes = getenv("HERCULES_CURR_ACTIVE_DATA_NODES");
int curr_num_data_nodes_env = atoi(getenv("HERCULES_CURR_ACTIVE_DATA_NODES"));
// fprintf(stderr, "[++ HERCULES] curr_num_data_nodes_env=%d\n", curr_num_data_nodes_env);
int32_t old_num_storages = curr_imss.info.num_active_storages;
if (curr_num_data_nodes_env != curr_imss.info.num_active_storages)
{
......
......@@ -72,12 +72,17 @@ int ready(char *tmp_file_path, const char *msg)
// fprintf(stderr, "Trying to create the file %s with the message %s\n", tmp_file_path, msg);
char status[25];
char err_msg[132];
char cwd[PATH_MAX];
FILE *tmp_file; // = tmpfile(); // make the file pointer as temporary file.
if(getcwd(cwd, sizeof(cwd)) == NULL) {
perror("Error getting the current working directory.");
}
tmp_file = fopen(tmp_file_path, "w");
if (tmp_file == NULL)
{
sprintf(err_msg, "Error in creating the temporary file %s\n", tmp_file_path);
sprintf(err_msg, "Error in creating the temporary file %s, current directory is %s\n", tmp_file_path, cwd);
perror(err_msg);
return -1;
}
......
......@@ -621,7 +621,7 @@ __attribute__((constructor)) void imss_posix_init(void)
elapsed = seconds + useconds / 1e6;
init = 1;
// fprintf(stderr, "\033[0;31m The number of active servers is %d \033[0m \n", num_active_storages);
fprintf(stderr, "\033[0;31m The number of active servers is %d \033[0m \n", num_active_storages);
}
int getConfiguration()
......@@ -958,10 +958,10 @@ int close(int fd)
ret = 0;
}
slog_live("[POSIX]. Ending Hercules 'close', pathname=%s, ret=%d\n", pathname, ret);
// fprintf(stderr, "[POSIX]. Ending Hercules 'close', pathname=%s, ret=%d\n", pathname, ret);
// Set offset to 0.
// map_fd_update_value(map_fd, pathname, fd, 0);
map_fd_erase(map_fd, fd);
real_close(fd);
}
else
{
......@@ -5947,4 +5947,4 @@ int fchdir(int fd)
// }
// return ret;
// }
\ No newline at end of file
// }
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment