Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Register
Sign in
Toggle navigation
Menu
Open sidebar
admire
Hercules
Commits
a4afba6c
Commit
a4afba6c
authored
3 months ago
by
david.singh
Browse files
Options
Download
Email Patches
Plain Diff
Add deployment script improvements.
parent
8b3c7736
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
app/server.c
+15
-9
app/server.c
conf/hercules-template.conf
+4
-3
conf/hercules-template.conf
include/arg_parser.h
+1
-0
include/arg_parser.h
scripts/check-servers.sh
+8
-4
scripts/check-servers.sh
scripts/hercules
+101
-92
scripts/hercules
src/imss.c
+1
-1
src/imss.c
src/workers.c
+6
-1
src/workers.c
tools/imss_posix.c
+3
-3
tools/imss_posix.c
with
139 additions
and
113 deletions
+139
-113
app/server.c
+
15
-
9
View file @
a4afba6c
...
...
@@ -106,8 +106,8 @@ int get_number_of_active_nodes()
else
{
number_active_storage_servers
=
atoi
(
buf
);
fprintf
(
stderr
,
"[
Wake up s
erver] The new number of active data nodes is %s
\n
"
,
buf
);
slog_debug
(
"[
Wake up s
erver] The new number of active data nodes is %s
\n
"
,
buf
);
fprintf
(
stderr
,
"[
S
erver] The new number of active data nodes is %s
\n
"
,
buf
);
slog_debug
(
"[
S
erver] The new number of active data nodes is %s
\n
"
,
buf
);
}
// Close the file.
ret
=
close
(
fd
);
...
...
@@ -280,7 +280,7 @@ void handle_signal_server(int signal)
char
buf
[
10
],
action
[
20
];
;
// Get the operation number.
int
fd
=
open
(
"/tmp/hercules_pkill_operation"
,
O_RDONLY
);
int
fd
=
open
(
"
.
/tmp/hercules_pkill_operation"
,
O_RDONLY
);
if
(
fd
==
-
1
)
{
perror
(
"ERR_HERCULES_OPEN_PKILL_OPERATION"
);
...
...
@@ -347,7 +347,7 @@ void handle_signal_server(int signal)
// This file is read by the hercules script to know whether this server
// shut down correctly.
char
tmp_file_path
[
100
];
sprintf
(
tmp_file_path
,
"/tmp/%c-hercules-%d-%s"
,
args
.
type
,
args
.
id
,
action
);
sprintf
(
tmp_file_path
,
"
%s
/tmp/%c-hercules-%d-%s"
,
args
.
hercules_path
,
args
.
type
,
args
.
id
,
action
);
ready
(
tmp_file_path
,
"OK"
);
}
if
(
signal
==
SIGUSR2
)
// wake up this server.
...
...
@@ -363,7 +363,8 @@ void handle_signal_server(int signal)
// This file is read by the hercules script to know whether this server
// woke up correctly.
char
tmp_file_path
[
100
];
sprintf
(
tmp_file_path
,
"/tmp/%c-hercules-%d-up"
,
args
.
type
,
args
.
id
);
sprintf
(
tmp_file_path
,
"%s/tmp/%c-hercules-%d-up"
,
args
.
hercules_path
,
args
.
type
,
args
.
id
);
fprintf
(
stderr
,
"Writting file %s
\n
"
,
tmp_file_path
);
ready
(
tmp_file_path
,
"OK"
);
}
}
...
...
@@ -451,7 +452,6 @@ int32_t main(int32_t argc, char **argv)
return
0
;
}
sprintf
(
tmp_file_path
,
"/tmp/%c-hercules-%d-start"
,
args
.
type
,
args
.
id
);
cfg
=
cfg_init
();
conf_path
=
getenv
(
"HERCULES_CONF"
);
...
...
@@ -515,6 +515,12 @@ int32_t main(int32_t argc, char **argv)
// fprintf(stderr, "Configuration file loaded: %s\n", conf_path);
}
if
(
getenv
(
"HERCULES_PATH"
)
!=
NULL
)
strcpy
(
args
.
hercules_path
,
getenv
(
"HERCULES_PATH"
));
else
if
(
cfg_get
(
cfg
,
"HERCULES_PATH"
))
strcpy
(
args
.
hercules_path
,
cfg_get
(
cfg
,
"HERCULES_PATH"
));
if
(
cfg_get
(
cfg
,
"URI"
))
{
aux
=
cfg_get
(
cfg
,
"URI"
);
...
...
@@ -624,6 +630,7 @@ int32_t main(int32_t argc, char **argv)
}
}
// IMSS_DEBUG_LEVEL = SLOG_NONE;
sprintf
(
tmp_file_path
,
"%s/tmp/%c-hercules-%d-start"
,
args
.
hercules_path
,
args
.
type
,
args
.
id
);
/***************************************************************/
/******************** PARSE INPUT ARGUMENTS ********************/
...
...
@@ -640,7 +647,6 @@ int32_t main(int32_t argc, char **argv)
slog_debug
(
"Server type=%c
\n
"
,
args
.
type
);
struct
tm
tm
=
*
localtime
(
&
t
);
sprintf
(
log_path
,
"./%c-server-%d.%02d-%02d-%02d"
,
args
.
type
,
args
.
id
,
tm
.
tm_hour
,
tm
.
tm_min
,
tm
.
tm_sec
);
// sprintf(log_path, "./%c-server", args.type);
slog_init
(
log_path
,
IMSS_DEBUG_LEVEL
,
IMSS_DEBUG_FILE
,
IMSS_DEBUG_SCREEN
,
1
,
1
,
1
,
args
.
id
);
if
(
IMSS_DEBUG_FILE
>
0
)
...
...
@@ -1202,7 +1208,7 @@ int32_t main(int32_t argc, char **argv)
// sleep(1);
// char tmp_file_path[100];
// sprintf(tmp_file_path, "/tmp/%c-hercules-%d-down", args.type, args.id);
// sprintf(tmp_file_path, "
.
/tmp/%c-hercules-%d-down", args.type, args.id);
// stop_server();
// move_blocks_2_server(args.stat_port, args.id, imss_uri, g_map);
...
...
@@ -1226,7 +1232,7 @@ int32_t main(int32_t argc, char **argv)
// ep_close(ucp_worker, pub_ep, UCP_EP_CLOSE_MODE_FORCE);
// ep_close(ucp_worker, client_ep, UCP_EP_CLOSE_MODE_FORCE);
sprintf
(
tmp_file_path
,
"/tmp/%c-hercules-%d-stop"
,
args
.
type
,
args
.
id
);
sprintf
(
tmp_file_path
,
"
%s
/tmp/%c-hercules-%d-stop"
,
args
.
hercules_path
,
args
.
type
,
args
.
id
);
ready
(
tmp_file_path
,
"OK"
);
// Free the memory buffer.
...
...
This diff is collapsed.
Click to expand it.
conf/hercules-template.conf
+
4
-
3
View file @
a4afba6c
...
...
@@ -8,7 +8,7 @@ BLOCK_SIZE = 512
MOUNT_POINT
= /
mnt
/
hercules
/
# Path where the Hercules project is located
HERCULES_PATH
= /
beegfs
/
home
/
j
avi
er
.
garciablas
/
hercules
HERCULES_PATH
= /
beegfs
/
home
/
d
avi
d
.
singh
/
hercules
# Port listening in the metadata node service
METADATA_PORT
=
75000
...
...
@@ -35,9 +35,10 @@ ATTACHED = 0
MALLEABILITY
=
0
UPPER_BOUND_MALLEABILITY
=
0
LOWER_BOUND_MALLEABILITY
=
0
INIT_NUM_DATA_SERVERS
=
1
# File containing a list of nodes serving as data nodes
DATA_HOSTFILE
=
data_hostfile
DATA_HOSTFILE
=
/
beegfs
/
home
/
david
.
singh
/
EpiGraphFlexMPI
/
data_hostfile
# Number of threads attending data requests
THREAD_POOL
=
1
...
...
@@ -46,7 +47,7 @@ THREAD_POOL = 1
STORAGE_SIZE
=
1
# No limit
# File containing a list of nodes serving as metadata nodes
METADATA_HOSTFILE
=
meta_hostfile
METADATA_HOSTFILE
=
/
beegfs
/
home
/
david
.
singh
/
EpiGraphFlexMPI
/
meta_hostfile
# Replication factor (1, 2 or 3)
REPL_FACTOR
=
1
...
...
This diff is collapsed.
Click to expand it.
include/arg_parser.h
+
1
-
0
View file @
a4afba6c
...
...
@@ -36,6 +36,7 @@ struct arguments
uint64_t
port
;
/* port arg to '-p' */
int64_t
bufsize
;
/* buffer size arg to '-b' */
char
imss_uri
[
32
];
/* IMSS URI arg to '-i' */
char
hercules_path
[
PATH_MAX
];
/* hercules path */
char
*
stat_host
;
/* Metadata server hostname arg to '-H' */
int64_t
stat_port
;
/* Metadata server port number arg to '-P' */
int64_t
num_servers
;
/* number of data servers arg to '-n' */
...
...
This diff is collapsed.
Click to expand it.
scripts/check-servers.sh
+
8
-
4
View file @
a4afba6c
...
...
@@ -3,13 +3,13 @@
SERVER_TYPE
=
$1
SERVER_NUMBER
=
$2
ACTION
=
$3
# expected string action, e.g., down when servers are stopped.
ATTEMPS
=
6
0
ATTEMPS
=
1
0
i
=
1
FILE
=
"/tmp/
$SERVER_TYPE
-hercules-
$SERVER_NUMBER
-
$ACTION
"
FILE
=
"
.
/tmp/
$SERVER_TYPE
-hercules-
$SERVER_NUMBER
-
$ACTION
"
## Checks if the file exists.
until
[
-f
$FILE
]
;
do
#
echo "Waiting for $FILE, attemp $i"
echo
"Waiting for
$FILE
, attemp
$i
"
i
=
$((
$i
+
1
))
## Waits up to $ATTEMPS attempts, then returns an error.
if
[
$i
-gt
$ATTEMPS
]
;
then
...
...
@@ -23,8 +23,12 @@ until [ -f $FILE ]; do
done
## Checks if the server was deployed correctly.
STATUS
=
$(
cat
$FILE
|
grep
"STATUS"
|
awk
'{print $3}'
)
STATUS
=
$(
cat
--
$FILE
|
grep
"STATUS"
|
awk
'{print $3}'
)
echo
"STATUS=
$STATUS
"
## Removes the file.
set
-x
rm
${
FILE
}
set
+x
if
[
"
$STATUS
"
!=
"OK"
]
;
then
# echo "[X] Error deploying server $SERVER_NUMBER."
exit
1
...
...
This diff is collapsed.
Click to expand it.
scripts/hercules
+
101
-
92
View file @
a4afba6c
#!/bin/bash
## Last modification:
06/09
/2024
## Last modification:
18/11
/2024
## Genaro Sanchez-Gallegos
#
set -x
#set -x
#######################################################
#######################################################
## To stop servers.
StopServers
()
{
NAME
=
$1
# Server type ("metadata" or "data").
hosts
=
$2
# List of hostnames.
shift
# Operation to be applied to the server.
# 0 indicates the server will be killed, and
# 1 indicates the server will be suspended
# (e.g., Hercules shrinks when malleability is enabled).
OPERATION
=
$3
OPERATION
=
$1
shift
hosts
=(
"
$@
"
)
# List of hostnames.
echo
"# Hercules: Stopping
$NAME
servers in
${
hosts
[@]
}
"
if
[[
"
$VERBOSE
"
-eq
"1"
]]
;
then
...
...
@@ -22,7 +24,8 @@ StopServers() {
for
node
in
"
${
hosts
[@]
}
"
do
# Set the action to be performed by the servers when they receive the pkill signal.
(
ssh
$node
"echo
$OPERATION
> /tmp/hercules_pkill_operation"
)
# ( ssh $node "echo $OPERATION > ./tmp/hercules_pkill_operation" )
echo
$OPERATION
>
"
${
HERCULES_PATH
}
/tmp/hercules_pkill_operation"
# Kill threads and finish the server.
(
ssh
$node
"pkill -SIGUSR1 hercules_server"
)
done
...
...
@@ -30,7 +33,9 @@ StopServers() {
WakeUpServers
()
{
NAME
=
$1
# Server type ("meta" or "data").
hosts
=
$2
# List of hostnames.
shift
hosts
=(
"
$@
"
)
# List of hostnames.
# hosts=$2 # List of hostnames.
# Wake up desired servers.
for
node
in
"
${
hosts
[@]
}
"
...
...
@@ -38,8 +43,9 @@ WakeUpServers() {
### echo "ssh $node pkill hercules_server"
(
ssh
$node
"pkill -SIGUSR2 hercules_server"
)
done
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
data_hostfile data2start_hostfile
>
data2up_index
set
-x
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
${
HERCULES_DATA_HOSTFILE
}
${
HERCULES_DATA_HOSTFILE_PROV
}
>
${
HERCULES_PATH
}
/tmp/data2up_index_
$SLURM_JOB_ID
set
+x
WaitForServers
"data"
"d"
"up"
${
hosts
[@]
}
}
...
...
@@ -60,70 +66,44 @@ WaitForServers() {
shift
hosts
=(
"
$@
"
)
file_name
=
"data2
${
ACTION
}
_index"
readarray
-t
server_index_arr <
$file_name
#data2start_index
file_name
=
"
${
HERCULES_PATH
}
/tmp/
data2
${
ACTION
}
_index
_
$SLURM_JOB_ID
"
readarray
-t
server_index_arr <
$file_name
#data2start_index
-<SLURM-JOB-ID>
# echo "hostnames=${hosts[@]}"
ATTEMPS
=
1000
for
node
in
"
${
hosts
[@]
}
"
do
# FILE="/tmp/$SERVER_TYPE-hercules-$((server_index_arr[$SERVER_ID]-1))-$ACTION"
SERVER_NUMBER
=
$((
server_index_arr[
$SERVER_ID
]-
1
))
if
[[
"
$VERBOSE
"
-eq
"1"
]]
;
then
echo
"# Hercules:
$ACTION
$SERVER_NAME
server
$SERVER_NUMBER
on
$node
"
fi
COMMAND
=
"
$HERCULES_BASH_PATH
/check-servers.sh
$SERVER_TYPE
$SERVER_NUMBER
$ACTION
"
### echo "[+] Running comprobation in $node... $COMMAND"
# if slurm is not available.
if
[[
"
$SLURM
"
-eq
"0"
]]
;
then
if
[
-z
"
$(
ssh-keygen
-F
$node
)
"
]
;
then
ssh-keyscan
-H
$node
>>
~/.ssh/known_hosts
fi
if
[[
"
$VERBOSE
"
-eq
"1"
]]
;
then
echo
"Running ssh
$node
$COMMAND
"
echo
"Running ssh
$node
--
$COMMAND
"
fi
ret
=
$(
ssh
$node
"
$COMMAND
"
)
ret
=
$(
ssh
$node
--
"
$COMMAND
"
)
if
[[
"
$VERBOSE
"
-eq
"1"
]]
;
then
echo
"Return code
$ret
"
fi
else
# if slurm is available.
ret
=
$(
srun
-N
1
-n
1
-c
1
-m
block:block:block
--mem
=
1G
-w
$node
$COMMAND
)
set
-x
ret
=
$(
srun
-N
1
-n
1
-c
1
-m
block:block:block
--mem
=
1G
-w
$node
--
$COMMAND
)
set
+x
fi
ret
=
$?
if
[
"
$ret
"
-gt
0
]
;
then
echo
"[Error:
$ret
] It has not been possible to "
${
ACTION
}
" a
$SERVER_NAME
server on
$node
, please verify the configuration file and logs."
#StopServers $SERVER_NAME $hosts 1
exit
1
fi
## Checks if the file exists.
## until [ -f $FILE ]; do
# ret=0
# until [ "$ret" -gt 0 ]; do
# # echo "Waiting for $FILE, attemp $i"
# i=$(($i + 1))
# ## Waits "attemps" times, then an error is return.
# if [ $i -gt $ATTEMPS ]; then
# exit 1
# fi
# t=$(($i % 5))
# if [ $t -eq 0 ]; then
# echo "[+][$HOSTNAME] Waiting for server $((server_index_arr[$SERVER_ID]-1)), $FILE"
# fi
# sleep 1
# done
# ## Checks if the server was deploy correctly.
# STATUS=$(cat $FILE | grep "STATUS" | awk '{print $3}')
# if [ "$STATUS" != "OK" ]; then
# # echo "[X] Error deploying server $SERVER_NUMBER."
# exit 1
# fi
SERVER_ID
=
$((
SERVER_ID+1
))
if
[[
"
$VERBOSE
"
-eq
"1"
]]
;
then
echo
"[OK]
$SERVER_NAME
$i
server running in
$node
"
fi
fi
done
}
...
...
@@ -246,7 +226,7 @@ fi
if
[[
"
$VERBOSE
"
-eq
"1"
]]
;
then
echo
"Reading configuration from
$FILE
"
fi
export
HERCULES_CONF
=
"
$FILE
"
#
export HERCULES_CONF="$FILE"
## Checks if a configuration file was set.
...
...
@@ -293,11 +273,6 @@ if [[ "$SLURM" -eq "0" && ! $STATUS = "stop" ]]; then
# SERVERS_HOSTFILES_OPTION=1
#SERVERS_HOSTFILES_OPTION=$(($SERVERS_HOSTFILES_OPTION + 1))
fi
# if a server hostfile is missing.
#if [ ! "$SERVERS_HOSTFILES_OPTION" -eq "2" ]; then
# echo "One or more hostfiles missing for the servers, please specify them with: -m <metadata_hostfile> -d <data_hostfile>"
# exit 0
#fi
fi
...
...
@@ -325,7 +300,6 @@ if [[ "$VERBOSE" -eq "1" ]]; then
echo
"Hercules path -
$HERCULES_PATH
"
fi
## Read configuration file.
export
HERCULES_MOUNT_POINT
=
$(
cat
$FILE
|
grep
"
\<
MOUNT_POINT
\>
"
|
head
-1
|
awk
'{print $3}'
)
META_PORT
=
$(
cat
$FILE
|
grep
"
\<
METADATA_PORT
\>
"
|
awk
'{print $3}'
)
...
...
@@ -347,8 +321,20 @@ if [ -z "$INIT_HERCULES_NUM_DATA" ] || [ "$INIT_HERCULES_NUM_DATA" = "0" ] ; the
echo
"INIT_HERCULES_NUM_DATA =
$INIT_HERCULES_NUM_DATA
"
fi
# set -x
if
[[
"
$SLURM
"
-eq
"1"
]]
;
then
## If slurm is enabled, we concat the job id to the configuration file to create a new one.
CURR_HERCULES_CONF_FILE
=
"
${
FILE
}
_
${
SLURM_JOB_ID
}
"
cp
$FILE
$CURR_HERCULES_CONF_FILE
else
CURR_HERCULES_CONF_FILE
=
"
${
FILE
}
"
fi
export
HERCULES_CONF
=
$CURR_HERCULES_CONF_FILE
# exit 0
# echo "+ + + Initial number of data nodes is $INIT_HERCULES_NUM_DATA/$HERCULES_NUM_DATA"
export
HERCULES_INIT_NUM_DATA
=
$INIT_HERCULES_NUM_DATA
export
"
HERCULES_INIT_NUM_DATA=
$
{
INIT_HERCULES_NUM_DATA
}
"
## \< \> to match exact word.
...
...
@@ -374,19 +360,19 @@ then
## the hercules "remove" option to stop only some
## data servers. "stop" option will stop all running
## data servers.
cat
$HERCULES_DATA_HOSTFILE
>
data2stop_hostfile
cat
$HERCULES_DATA_HOSTFILE
>
"
data2stop_hostfile
_
$SLURM_JOB_ID
"
## Creates an array with the data servers hostnames.
readarray
-t
hosts <
$HERCULES_DATA_HOSTFILE
## Stop the data servers.
StopServers
"data"
${
hosts
[@]
}
1
StopServers
"data"
1
${
hosts
[@]
}
## Checks if user wants to wait until all servers are stopped.
# 1 indicates that this function should check if
# all servers have stopped correctly (synchronous).
# 0 in other case (asynchronous).
if
[[
"
$WAIT_SERVERS
"
-eq
"1"
]]
;
then
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
data_hostfile data_hostfile
>
"
data2
${
STATUS
}
_index"
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
${
HERCULES_DATA_HOSTFILE
}
${
HERCULES_DATA_HOSTFILE
}
>
"
${
HERCULES_PATH
}
/tmp/
data2
${
STATUS
}
_index
_
$SLURM_JOB_ID
"
WaitForServers
"data"
"d"
"
${
STATUS
}
"
${
hosts
[@]
}
else
echo
"No waiting for data servers to be stopped."
...
...
@@ -398,17 +384,17 @@ then
exit
0
fi
# Stop metadata servers.
cat
$HERCULES_METADATA_HOSTFILE
>
data2stop_hostfile
cat
$HERCULES_METADATA_HOSTFILE
>
"
data2stop_hostfile
_
$SLURM_JOB_ID
"
readarray
-t
hosts <
$HERCULES_METADATA_HOSTFILE
# echo "meta hosts=$hosts"
StopServers
"metadata"
${
hosts
[@]
}
1
StopServers
"metadata"
1
${
hosts
[@]
}
## Checks if user wants to wait until all servers are stopped.
# 1 indicates that this function should check if
# all servers have stopped correctly (synchronous).
# 0 in other case (asynchronous).
if
[[
"
$WAIT_SERVERS
"
-eq
"1"
]]
;
then
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
meta_hostfile meta_hostfile
>
"
data2
${
STATUS
}
_index"
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
${
HERCULES_METADATA_HOSTFILE
}
${
HERCULES_METADATA_HOSTFILE
}
>
"
${
HERCULES_PATH
}
/tmp/
data2
${
STATUS
}
_index"
WaitForServers
"metadata"
"m"
"
${
STATUS
}
"
${
hosts
[@]
}
else
echo
"No waiting for data servers to be stopped."
...
...
@@ -431,14 +417,14 @@ then
echo
"Data server file "
$DATA_SERVER_FILE
" does not exists"
exit
0
fi
HERCULES_DATA_HOSTFILE
=
$DATA_SERVER_FILE
HERCULES_DATA_HOSTFILE
_PROV
=
$DATA_SERVER_FILE
else
echo
"No data server file was provided to add more servers."
exit
0
fi
readarray
-t
hosts <
$HERCULES_DATA_HOSTFILE
readarray
-t
hosts <
$HERCULES_DATA_HOSTFILE
_PROV
# echo "data hosts=$hosts"
WakeUpServers
"data"
$hosts
WakeUpServers
"data"
$
{
hosts
[@]
}
# sleep 10
exit
0
fi
...
...
@@ -466,11 +452,12 @@ then
readarray
-t
hosts <
$HERCULES_DATA_HOSTFILE
# 1 = Stops some servers synchronously.
# 0 = Do not kill the process.
StopServers
"data"
${
hosts
[@]
}
0
StopServers
"data"
0
${
hosts
[@]
}
# For "remove" operations we always wait for selected
# servers to be removed.
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
data_hostfile data_hostfile
>
"data2
${
STATUS
}
_index"
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
$HERCULES_DATA_HOSTFILE
$HERCULES_DATA_HOSTFILE
>
"
${
HERCULES_PATH
}
/tmp/data2
${
STATUS
}
_index_
$SLURM_JOB_ID
"
WaitForServers
"data"
"d"
"
${
STATUS
}
"
${
hosts
[@]
}
# Metadata server is not stopped.
...
...
@@ -478,6 +465,37 @@ then
exit
0
fi
## If not empty, the "METADATA_HOSTFILE" and "DATA_HOSTFILE" fields were set in the configuration file.
if
[
!
-z
"
$HERCULES_METADATA_HOSTFILE
"
]
;
then
if
[[
"
$SLURM
"
-eq
"1"
]]
;
then
# If slurm is enabled, we added the slurm job id to the meta hostfile.
HERCULES_METADATA_HOSTFILE+
=
"_"
$SLURM_JOB_ID
## Replace all "/" with "\/" because "/" is a special character for the sed command.
ESCAPED_HERCULES_METADATA_HOSTFILE
=
"
${
HERCULES_METADATA_HOSTFILE
//\//\\/
}
"
## Replace the old metadata hostfile name with the new one containing the job id.
sed
-i
"s/^METADATA_HOSTFILE = .*/METADATA_HOSTFILE =
$ESCAPED_HERCULES_METADATA_HOSTFILE
/g"
"
$CURR_HERCULES_CONF_FILE
"
fi
fi
if
[
!
-z
"
$HERCULES_DATA_HOSTFILE
"
]
;
then
if
[[
"
$SLURM
"
-eq
"1"
]]
;
then
# If slurm is enabled, we added the slurm job id to the data hostfile.
HERCULES_DATA_HOSTFILE+
=
"_"
$SLURM_JOB_ID
## Replace all "/" with "\/" because "/" is a special character for the sed command.
ESCAPED_HERCULES_DATA_HOSTFILE
=
"
${
HERCULES_DATA_HOSTFILE
//\//\\/
}
"
## Replace the old data hostfile name with the new one containing the job id.
sed
-i
"s/^DATA_HOSTFILE = .*/DATA_HOSTFILE =
$ESCAPED_HERCULES_DATA_HOSTFILE
/g"
"
$CURR_HERCULES_CONF_FILE
"
fi
fi
## Creates the initial "hercules_num_act_nodes" file.
echo
$HERCULES_INIT_NUM_DATA
>
hercules_num_act_nodes
## Set the name that will be used to create the
if
[[
"
$SLURM
"
-eq
"1"
]]
;
then
FULL_HOSTFILE
=
"hostfile_
$SLURM_JOB_ID
"
else
FULL_HOSTFILE
=
"hostfile"
fi
## If hostfiles were not set, then we create a hostfile containing the allocated nodes.
if
[[
"
$VERBOSE
"
-eq
"1"
]]
;
then
...
...
@@ -485,7 +503,7 @@ if [[ "$VERBOSE" -eq "1" ]]; then
fi
if
[[
$SERVERS_HOSTFILES_OPTION
-eq
"0"
]]
;
then
#srun -pernode hostname |sort > hostfile
scontrol show hostnames
"
$SLURM_JOB_NODELIST
"
>
hostfile
scontrol show hostnames
"
$SLURM_JOB_NODELIST
"
>
$FULL_HOSTFILE
# scontrol show hostnames "$SLURM_JOB_NODELIST"
fi
...
...
@@ -501,11 +519,11 @@ if [ -z "$HERCULES_MPI_HOSTFILE_NAME" ]; then
fi
## Creates a client hostfile using some of the allocated nodes.
if
[[
$SLURM
-eq
"1"
]]
;
then
HERCULES_MPI_HOSTFILE_NAME
=
"client_hostfile
"
HERCULES_MPI_HOSTFILE_NAME
=
"client_hostfile
_"
$SLURM_JOB_ID
if
[[
$ATTACHED
-eq
"1"
]]
;
then
tail
-n
+
$((
HERCULES_NUM_METADATA+1
))
hostfile
|
head
-n
$NUM_NODES_FOR_CLIENTS
>
$HERCULES_MPI_HOSTFILE_NAME
tail
-n
+
$((
HERCULES_NUM_METADATA+1
))
$FULL_HOSTFILE
|
head
-n
$NUM_NODES_FOR_CLIENTS
>
$HERCULES_MPI_HOSTFILE_NAME
else
tail
-n
+
$((
HERCULES_NUM_METADATA+HERCULES_NUM_DATA+1
))
hostfile
|
head
-n
$NUM_NODES_FOR_CLIENTS
>
$HERCULES_MPI_HOSTFILE_NAME
tail
-n
+
$((
HERCULES_NUM_METADATA+HERCULES_NUM_DATA+1
))
$FULL_HOSTFILE
|
head
-n
$NUM_NODES_FOR_CLIENTS
>
$HERCULES_MPI_HOSTFILE_NAME
fi
fi
fi
...
...
@@ -525,6 +543,7 @@ then
echo
"[Error] Metadata server file not specified, please set one using -m <filename> flag."
exit
1
fi
else
if
[[
"
$VERBOSE
"
-eq
"1"
]]
;
then
echo
"[+] Metadata server file not specified, getting information from slurm."
...
...
@@ -533,7 +552,7 @@ then
## then we create a file which contains the hostnames of the nodes that
## will be used to deploy the determinate set of metadata servers.
#readarray -t meta_hosts < <(head -n $HERCULES_NUM_METADATA hostfile)
head
-n
"
$HERCULES_NUM_METADATA
"
hostfile
>
"
$HERCULES_METADATA_HOSTFILE
"
head
-n
"
$HERCULES_NUM_METADATA
"
$FULL_HOSTFILE
>
"
$HERCULES_METADATA_HOSTFILE
"
fi
## To create an array with the meta nodes.
#printf "%s\n" ${meta_hosts[@]} > "$META_SERVER_FILE"
...
...
@@ -552,7 +571,7 @@ echo "[+] Hercules: Starting metadata servers on ${meta_hosts[@]}"
start
=
`
date
+%s.%N
`
for
node
in
${
meta_hosts
[@]
}
do
RM
=
"rm /tmp/m-hercules-
$i
"
RM
=
"rm
${
HERCULES_PATH
}
/tmp/m-hercules-
$i
"
COMMAND
=
"
$HERCULES_BUILD_PATH
/hercules_server m
$i
"
## If slurm is not being used, we deploy the service by connecting
## to the node via ssh.
...
...
@@ -582,7 +601,7 @@ done
## Wait until all metadata servers are up.
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
meta_hostfile meta_hostfile
>
data2start_index
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
"
$HERCULES_METADATA_HOSTFILE
"
"
$HERCULES_METADATA_HOSTFILE
"
>
"
${
HERCULES_PATH
}
/tmp/
data2start_index
_
$SLURM_JOB_ID
"
WaitForServers
"metadata"
"m"
"start"
${
meta_hosts
[@]
}
end
=
`
date
+%s.%N
`
runtime
=
$(
echo
"
$end
-
$start
"
| bc
-l
)
...
...
@@ -611,7 +630,7 @@ then
## then we create a file which contains the hostnames of the nodes that
## will be used to deploy the determinate set of data servers.
# readarray -t data_hosts < <(tail -n +$((HERCULES_NUM_METADATA+1)) hostfile | head -n $HERCULES_NUM_DATA)
tail
-n
+
$((
HERCULES_NUM_METADATA+1
))
hostfile
|
head
-n
"
$HERCULES_NUM_DATA
"
>
"
$HERCULES_DATA_HOSTFILE
"
tail
-n
+
$((
HERCULES_NUM_METADATA+1
))
$FULL_HOSTFILE
|
head
-n
"
$HERCULES_NUM_DATA
"
>
"
$HERCULES_DATA_HOSTFILE
"
fi
## To create an array with the data nodes.
# printf "%s\n" ${data_hosts[@]} > "$DATA_SERVER_FILE"
...
...
@@ -631,14 +650,7 @@ fi
start
=
`
date
+%s.%N
`
for
node
in
${
data_hosts
[@]
}
do
# if [ "$i" -lt "$INIT_HERCULES_NUM_DATA" ]; then ## server is started and set as online.
# server_init_status=1
# else ## server is started but set as offline.
# server_init_status=0
# fi
# echo "[+] Running data server $i in $node..."
RM
=
"rm /tmp/d-hercules-
$i
"
RM
=
"rm
${
HERCULES_PATH
}
/tmp/d-hercules-
$i
"
COMMAND
=
"
$HERCULES_BUILD_PATH
/hercules_server d
$i
${
meta_hosts
[0]
}
$INIT_HERCULES_NUM_DATA
"
if
[[
"
$SLURM
"
-eq
"0"
]]
;
then
# ssh $node "$RM; cd $HERCULES_BASH_PATH && $COMMAND &"
...
...
@@ -667,7 +679,7 @@ do
done
## Wait until all data servers are up.
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
data_hostfile data_hostfile
>
data2start_index
awk
'FNR==NR{a[$0]=NR;next}{print a[$0]}'
$HERCULES_DATA_HOSTFILE
$HERCULES_DATA_HOSTFILE
>
"
${
HERCULES_PATH
}
/tmp/
data2start_index
_
$SLURM_JOB_ID
"
WaitForServers
"data"
"d"
"start"
${
data_hosts
[@]
}
end
=
`
date
+%s.%N
`
runtime
=
$(
echo
"
$end
-
$start
"
| bc
-l
)
...
...
@@ -706,10 +718,12 @@ case $MPI_DS in
;;
esac
# set enviroment variables.
export
HERCULES_NCPN
=
$NUM_CLIENTS_PER_NODE
export
HERCULES_NNFC
=
$NUM_NODES_FOR_CLIENTS
## set enviroment variables.
# mpi options.
export
"HERCULES_NCPN=
${
NUM_CLIENTS_PER_NODE
}
"
export
"HERCULES_NNFC=
${
NUM_NODES_FOR_CLIENTS
}
"
# data hostfile name.
export
"HERCULES_DATA_HOSTFILE=
${
HERCULES_DATA_HOSTFILE
}
"
unset
META_PORT
unset
DATA_PORT
...
...
@@ -731,7 +745,7 @@ unset STORAGE_SIZE
echo
-e
"#############################################################################
[!] 1. To export the path of the configuration file, run the following command:
${
GREEN
}
export HERCULES_CONF=
$FILE
export HERCULES_CONF=
$
CURR_HERCULES_CONF_
FILE
${
NC
}
#############################################################################"
...
...
@@ -752,7 +766,7 @@ unset LD_PRELOAD=$HERCULES_POSIX_PRELOAD
echo
-e
"#############################################################################
[!] To stop the services:
${
RED
}
hercules stop -f
$FILE
hercules stop -f
$
CURR_HERCULES_CONF_
FILE
${
NC
}
#############################################################################"
...
...
@@ -761,9 +775,4 @@ hercules stop -f $FILE
${
YELLOW
}
export UCX_POSIX_USE_PROC_LINK=n
${
NC
}
#############################################################################"
# fi
# export LD_PRELOAD=$HERCULES_PATH/build/tools/libhercules_posix.so
#############################################################################"
\ No newline at end of file
This diff is collapsed.
Click to expand it.
src/imss.c
+
1
-
1
View file @
a4afba6c
...
...
@@ -2256,7 +2256,7 @@ int32_t get_data_location(int32_t dataset_id, int32_t data_id, int32_t op_type)
// char *curr_num_data_nodes = getenv("HERCULES_CURR_ACTIVE_DATA_NODES");
int
curr_num_data_nodes_env
=
atoi
(
getenv
(
"HERCULES_CURR_ACTIVE_DATA_NODES"
));
// fprintf(stderr, "[++ HERCULES] curr_num_data_nodes_env=%d\n", curr_num_data_nodes_env);
int32_t
old_num_storages
=
curr_imss
.
info
.
num_active_storages
;
if
(
curr_num_data_nodes_env
!=
curr_imss
.
info
.
num_active_storages
)
{
...
...
This diff is collapsed.
Click to expand it.
src/workers.c
+
6
-
1
View file @
a4afba6c
...
...
@@ -72,12 +72,17 @@ int ready(char *tmp_file_path, const char *msg)
// fprintf(stderr, "Trying to create the file %s with the message %s\n", tmp_file_path, msg);
char
status
[
25
];
char
err_msg
[
132
];
char
cwd
[
PATH_MAX
];
FILE
*
tmp_file
;
// = tmpfile(); // make the file pointer as temporary file.
if
(
getcwd
(
cwd
,
sizeof
(
cwd
))
==
NULL
)
{
perror
(
"Error getting the current working directory."
);
}
tmp_file
=
fopen
(
tmp_file_path
,
"w"
);
if
(
tmp_file
==
NULL
)
{
sprintf
(
err_msg
,
"Error in creating the temporary file %s
\n
"
,
tmp_file_path
);
sprintf
(
err_msg
,
"Error in creating the temporary file
%s, current directory is
%s
\n
"
,
tmp_file_path
,
cwd
);
perror
(
err_msg
);
return
-
1
;
}
...
...
This diff is collapsed.
Click to expand it.
tools/imss_posix.c
+
3
-
3
View file @
a4afba6c
...
...
@@ -621,7 +621,7 @@ __attribute__((constructor)) void imss_posix_init(void)
elapsed
=
seconds
+
useconds
/
1e6
;
init
=
1
;
//
fprintf(stderr, "\033[0;31m The number of active servers is %d \033[0m \n", num_active_storages);
fprintf
(
stderr
,
"
\033
[0;31m The number of active servers is %d
\033
[0m
\n
"
,
num_active_storages
);
}
int
getConfiguration
()
...
...
@@ -958,10 +958,10 @@ int close(int fd)
ret
=
0
;
}
slog_live
(
"[POSIX]. Ending Hercules 'close', pathname=%s, ret=%d
\n
"
,
pathname
,
ret
);
// fprintf(stderr, "[POSIX]. Ending Hercules 'close', pathname=%s, ret=%d\n", pathname, ret);
// Set offset to 0.
// map_fd_update_value(map_fd, pathname, fd, 0);
map_fd_erase
(
map_fd
,
fd
);
real_close
(
fd
);
}
else
{
...
...
@@ -5947,4 +5947,4 @@ int fchdir(int fd)
// }
// return ret;
// }
\ No newline at end of file
// }
This diff is collapsed.
Click to expand it.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment