[오픈소스] GOQ 구축 (goq: a queuing and job management system fit for the cloud. Written in Go (golang)).
- 꼬반
- 2017. 3. 10. 09:01
option space PXE;
option PXE.mtftp-ip code 1 = ip-address;
option PXE.mtftp-cport code 2 = unsigned integer 16;
option PXE.mtftp-sport code 3 = unsigned integer 16;
option PXE.mtftp-tmout code 4 = unsigned integer 8;
option PXE.mtftp-delay code 5 = unsigned integer 8;
option arch code 93 = unsigned integer 16; # RFC4578
subnet 10.10.10.0 netmask 255.255.255.0 {
option routers 10.10.10.10;
range dynamic-bootp 10.10.10.200 10.10.10.250;
allow booting;
allow bootp;
next-server 10.10.10.10;
filename "pxelinux.0";
}
:wq!
# systemctl enable dhcpd
# systemctl start dhcpd
# firewall-cmd --permanent --add-masquerade
# firewall-cmd --permanent --add-rich-rule='rule family="ipv4" source address="10.10.10.0/24" accept'
# firewall-cmd --reload
vim ~/.bashrc
export GOROOT=/share/apps/go
export PATH=$PATH:$GOROOT/bin
export GOPATH=/share/workspace
export PATH=$PATH:$GOPATH/bin
export GOQ_HOME=/root
추가 후
:wq!
# go version
go version go1.6.2 linux/amd64
github src : https://github.com/glycerine/goq
# go get -t -u github.com/glycerine/mangos/compat
# go get -u -t github.com/glycerine/goq
# cd $GOPATH/src/github.com/glycerine/goq; make; go test -v
# goq init
# cd $GOQ_HOME
# vim serverloc
export GOQ_JSERV_IP=10.10.10.10
:wq!
마스터
$ cd $GOQ_HOME
$ goq init # only needed once.
$ nohup goq serve & # start the central server
워커
$ ssh computenode
$ for i in $(seq 1 $(cat /proc/cpuinfo |grep processor|wc -l)); do
/usr/bin/nohup goq work forever & done
잡 서밋
$ cd somewhere/where/the/job/wants/to/start
# start by doing 'goq sub' on the same machine
# that 'goq serve' was launched on. Just to learn the system.
$ goq sub ./myjobscript
테스트
마스터에서 확인
root 4060 2959 0 14:33 pts/0 00:00:00 goq serve
워커 1
root 12153 11924 0 14:36 pts/0 00:00:00 goq work forever
root 12154 11924 0 14:36 pts/0 00:00:00 goq work forever
워커 2
root 12215 12004 0 14:36 pts/0 00:00:00 goq work forever
root 12216 12004 0 14:36 pts/0 00:00:00 goq work forever
테스트 잡 서밋
[root@goq .goq]# goq sub /share/workspace/src/github.com/glycerine/goq/bin/sleep20.sh
[pid 5655] submitted job 5 to server at 'tcp://10.10.10.10:1776'.
[root@goq .goq]# goq stat
[pid 5661] stats for job server 'tcp://10.10.10.10:1776':
runQlen=1
waitingJobs=0
waitingWorkers=3
jservPid=4060
finishedJobsCount=3
droppedBadSigCount=0
cancelledJobCount=0
nextJobId=6
jservIP=10.10.10.10
jservPort=1776
badNonceCount=0
maxShow=10
runq 000000 runtime: < 1 heartbeat RunningJob[jid 5] = '/share/workspace/src/github.com/glycerine/goq/bin/sleep20.sh ' on worker 'tcp://10.10.10.201:39504'/pid:0. Lastping: none.
finished: [jid 1] total-time: 20.006763604s. done: 2016-06-21 14:14:45.162962843 +0900 KST. cmd: '/share/workspace/src/github.com/glycerine/goq/bin/sleep20.sh []' finished on worker 'tcp://192.168.150.73:46641'/pid:3531. . Err: ''
finished: [jid 3] total-time: 145.624801ms. done: 2016-06-21 14:37:12.110080928 +0900 KST. cmd: '/share/workspace/src/github.com/glycerine/goq/bin/good.sh []' finished on worker 'tcp://10.10.10.200:45736'/pid:12168. . Err: ''
finished: [jid 4] total-time: 20.019386787s. done: 2016-06-21 14:38:33.327349501 +0900 KST. cmd: '/share/workspace/src/github.com/glycerine/goq/bin/sleep20.sh []' finished on worker 'tcp://10.10.10.200:43298'/pid:12172. . Err: ''
--- goq security status---
summary-bad-signature-msgs: 0
summary-bad-nonce-msg: 0
--- goq progress status ---
summary-jobs-running: 1
summary-jobs-waiting: 0
summary-known-jobs: 1
summary-workers-waiting: 3
summary-cancelled-jobs: 0
summary-jobs-finished: 3
--- goq end status at time: 2016-06-21 15:44:40.540185547 +0900 KST ---
[root@goq .goq]# goq stat
[pid 5667] stats for job server 'tcp://10.10.10.10:1776':
runQlen=0
waitingJobs=0
waitingWorkers=4
jservPid=4060
finishedJobsCount=4
droppedBadSigCount=0
cancelledJobCount=0
nextJobId=6
jservIP=10.10.10.10
jservPort=1776
badNonceCount=0
maxShow=10
finished: [jid 1] total-time: 20.006763604s. done: 2016-06-21 14:14:45.162962843 +0900 KST. cmd: '/share/workspace/src/github.com/glycerine/goq/bin/sleep20.sh []' finished on worker 'tcp://192.168.150.73:46641'/pid:3531. . Err: ''
finished: [jid 3] total-time: 145.624801ms. done: 2016-06-21 14:37:12.110080928 +0900 KST. cmd: '/share/workspace/src/github.com/glycerine/goq/bin/good.sh []' finished on worker 'tcp://10.10.10.200:45736'/pid:12168. . Err: ''
finished: [jid 4] total-time: 20.019386787s. done: 2016-06-21 14:38:33.327349501 +0900 KST. cmd: '/share/workspace/src/github.com/glycerine/goq/bin/sleep20.sh []' finished on worker 'tcp://10.10.10.200:43298'/pid:12172. . Err: ''
finished: [jid 5] total-time: 20.031627872s. done: 2016-06-21 15:44:58.280047843 +0900 KST. cmd: '/share/workspace/src/github.com/glycerine/goq/bin/sleep20.sh []' finished on worker 'tcp://10.10.10.201:39504'/pid:12970. . Err: ''
--- goq security status---
summary-bad-signature-msgs: 0
summary-bad-nonce-msg: 0
--- goq progress status ---
summary-jobs-running: 0
summary-jobs-waiting: 0
summary-known-jobs: 0
summary-workers-waiting: 4
summary-cancelled-jobs: 0
summary-jobs-finished: 4
--- goq end status at time: 2016-06-21 15:45:00.384001311 +0900 KST ---
#!/bin/bash
# Demo job for the queue: print a start message, pause for 20 seconds,
# then print a completion message. Both messages go to stdout.
printf '%s\n' "sleep.sh begins sleeping for 20 seconds"
sleep 20
printf '%s\n' "sleep.sh done"
워커 2번에서 실행된 것을 확인할 수 있음.
결과
/root/.goq/o
out.00005 # jid
sleep.sh begins sleeping for 20 seconds
sleep.sh done
** 마스터 서버
# vim /etc/exports
/BiO 10.10.10.0/24(rw,sync,no_root_squash)
# systemctl start rpcbind
# systemctl start nfs-server
# systemctl start nfs-idmap
# systemctl start nfs-lock
# systemctl enable nfs-server
** 노드
# showmount -e 10.10.10.10
# mount -t nfs 10.10.10.10:/BiO /BiO
# vim /etc/fstab
10.10.10.10:/BiO /BiO nfs defaults 0 0
** hpl 벤치 준비 (failed...)
** ATLAS 설치 및 install
** hpl 은 추후 다시 설치 시도...
#!/bin/bash
# run.sh - demo job: wait 5 seconds, then append the file named by the first
# argument to ./result and also print that file to stdout.
#
# Usage: run.sh FILE

# Fail fast with a usage message; without an argument a bare `cat` would
# block waiting on stdin.
: "${1:?usage: run.sh FILE}"

echo "sleep 5"
sleep 5

# Quote the argument so file names containing whitespace or glob characters
# are passed to cat as a single, literal operand.
cat -- "$1" >> result
cat -- "$1"
#!/bin/bash
# Run ./run.sh sequentially on this machine, once for each argument 1..9.
for n in {1..9}; do
  ./run.sh "$n"
done
#!/bin/bash
# Submit ./run.sh to the goq server nine times, with arguments 1..9.
for n in {1..9}; do
  goq sub ./run.sh "$n"
done
[root@goq test]# goq stat
[pid 8221] stats for job server 'tcp://10.10.10.10:1776':
runQlen=0
waitingJobs=0
waitingWorkers=4
jservPid=6641
finishedJobsCount=22
droppedBadSigCount=0
cancelledJobCount=0
nextJobId=24
jservIP=10.10.10.10
jservPort=1776
badNonceCount=0
maxShow=10
finished: [jid 14] total-time: 5.006814286s. done: 2016-06-22 16:46:53.858905226 +0900 KST. cmd: './run.sh [9]' finished on worker 'tcp://10.10.10.200:38134'/pid:9547. . Err: ''
finished: [jid 15] total-time: 5.009640992s. done: 2016-06-22 16:52:28.8312227 +0900 KST. cmd: './run.sh [1]' finished on worker 'tcp://10.10.10.200:52480'/pid:9678. . Err: ''
finished: [jid 16] total-time: 5.009856439s. done: 2016-06-22 16:52:28.822657713 +0900 KST. cmd: './run.sh [2]' finished on worker 'tcp://10.10.10.201:54694'/pid:9931. . Err: ''
finished: [jid 17] total-time: 5.014137871s. done: 2016-06-22 16:52:28.827363219 +0900 KST. cmd: './run.sh [3]' finished on worker 'tcp://10.10.10.201:54228'/pid:9933. . Err: ''
finished: [jid 18] total-time: 5.005063449s. done: 2016-06-22 16:52:28.866128228 +0900 KST. cmd: './run.sh [4]' finished on worker 'tcp://10.10.10.200:38134'/pid:9681. . Err: ''
finished: [jid 19] total-time: 5.007172282s. done: 2016-06-22 16:52:33.867240633 +0900 KST. cmd: './run.sh [5]' finished on worker 'tcp://10.10.10.200:52480'/pid:9686. . Err: ''
finished: [jid 20] total-time: 5.014624321s. done: 2016-06-22 16:52:33.858707659 +0900 KST. cmd: './run.sh [6]' finished on worker 'tcp://10.10.10.201:54694'/pid:9941. . Err: ''
finished: [jid 21] total-time: 5.015421181s. done: 2016-06-22 16:52:33.863655635 +0900 KST. cmd: './run.sh [7]' finished on worker 'tcp://10.10.10.201:54228'/pid:9942. . Err: ''
finished: [jid 22] total-time: 5.011072741s. done: 2016-06-22 16:52:33.896002578 +0900 KST. cmd: './run.sh [8]' finished on worker 'tcp://10.10.10.200:38134'/pid:9691. . Err: ''
finished: [jid 23] total-time: 5.002196052s. done: 2016-06-22 16:52:38.898022316 +0900 KST. cmd: './run.sh [9]' finished on worker 'tcp://10.10.10.200:52480'/pid:9696. . Err: ''
--- goq security status---
summary-bad-signature-msgs: 0
summary-bad-nonce-msg: 0
--- goq progress status ---
summary-jobs-running: 0
summary-jobs-waiting: 0
summary-known-jobs: 0
summary-workers-waiting: 4
summary-cancelled-jobs: 0
summary-jobs-finished: 22
--- goq end status at time: 2016-06-22 17:43:52.258494225 +0900 KST ---
[root@goq test]#
run.sh (/BiO/bwa_data/)
#!/bin/bash
# Submit one `bwa index` job to goq for every *.fasta file in the current
# directory.
for fasta in *.fasta; do
  # When nothing matches, the glob stays as the literal string '*.fasta';
  # skip it so no job is submitted for a file that does not exist.
  [[ -e "$fasta" ]] || continue
  # echo "$fasta"   # uncomment to list each file as it is submitted
  # Quote the name so files containing spaces reach bwa as one argument.
  goq sub /BiO/apps/bwa-0.7.15/bwa index "$fasta"
done
[root@goq bwa_data]# goq stat
[pid 5300] stats for job server 'tcp://10.10.10.10:1776':
runQlen=0
waitingJobs=0
waitingWorkers=4
jservPid=6641
finishedJobsCount=77
droppedBadSigCount=0
cancelledJobCount=0
nextJobId=79
jservIP=10.10.10.10
jservPort=1776
badNonceCount=0
maxShow=10
finished: [jid 70] total-time: 25.652276979s. done: 2016-06-24 11:26:47.437507654 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-3R-chromosome-r5.37.fasta]' finished on worker 'tcp://10.10.10.201:54228'/pid:25409. . Err: ''
finished: [jid 72] total-time: 2m57.995945527s. done: 2016-06-24 11:29:38.320234035 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-all-chromosome-r5.37.fasta]' finished on worker 'tcp://10.10.10.200:52480'/pid:25095. . Err: ''
finished: [jid 67] total-time: 5m20.065814901s. done: 2016-06-24 11:31:20.588482645 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-2R-aligned-r5.37.fasta]' finished on worker 'tcp://10.10.10.201:54694'/pid:25399. . Err: ''
finished: [jid 74] total-time: 2m32.706963928s. done: 2016-06-24 11:32:11.06134462 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-all-gene_extended2000-r5.37.fasta]' finished on worker 'tcp://10.10.10.200:52480'/pid:25125. . Err: ''
finished: [jid 76] total-time: 5.257377818s. done: 2016-06-24 11:32:16.329604209 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-U-chromosome-r5.37.fasta]' finished on worker 'tcp://10.10.10.200:52480'/pid:25209. . Err: ''
finished: [jid 69] total-time: 6m56.254709913s. done: 2016-06-24 11:33:07.201563569 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-3R-aligned-r5.37.fasta]' finished on worker 'tcp://10.10.10.200:38134'/pid:25088. . Err: ''
finished: [jid 78] total-time: 14.240585455s. done: 2016-06-24 11:33:21.461607565 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-X-chromosome-r5.37.fasta]' finished on worker 'tcp://10.10.10.200:38134'/pid:25214. . Err: ''
finished: [jid 77] total-time: 4m4.989334558s. done: 2016-06-24 11:36:21.328854204 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-X-aligned-r5.37.fasta]' finished on worker 'tcp://10.10.10.200:52480'/pid:25211. . Err: ''
finished: [jid 75] total-time: 6m35.538950491s. done: 2016-06-24 11:37:56.176649363 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-all-predicted-r5.37.fasta]' finished on worker 'tcp://10.10.10.201:54694'/pid:25542. . Err: ''
finished: [jid 73] total-time: 11m17.50777801s. done: 2016-06-24 11:38:04.953963217 +0900 KST. cmd: '/BiO/apps/bwa-0.7.15/bwa [index dmel-all-clones-r5.37.fasta]' finished on worker 'tcp://10.10.10.201:54228'/pid:25415. . Err: ''
--- goq security status---
summary-bad-signature-msgs: 0
summary-bad-nonce-msg: 0
--- goq progress status ---
summary-jobs-running: 0
summary-jobs-waiting: 0
summary-known-jobs: 0
summary-workers-waiting: 4
summary-cancelled-jobs: 0
summary-jobs-finished: 77
--- goq end status at time: 2016-06-24 13:19:11.903386522 +0900 KST ---
[root@goq bwa_data]#