summaryrefslogtreecommitdiffstats
path: root/roles/cuda
diff options
context:
space:
mode:
Diffstat (limited to 'roles/cuda')
-rw-r--r--roles/cuda/.travis.yml47
-rw-r--r--roles/cuda/LICENSE22
-rw-r--r--roles/cuda/README.md64
-rw-r--r--roles/cuda/defaults/main.yml23
-rw-r--r--roles/cuda/files/nvidia_packaging_key.asc29
-rw-r--r--roles/cuda/handlers/main.yml43
-rw-r--r--roles/cuda/meta/main.yml140
-rw-r--r--roles/cuda/tasks/configure_apt.yml15
-rw-r--r--roles/cuda/tasks/configure_yum.yml28
-rw-r--r--roles/cuda/tasks/cuda_init.yml48
-rw-r--r--roles/cuda/tasks/main.yml54
-rwxr-xr-xroles/cuda/templates/cuda.sh.j27
-rw-r--r--roles/cuda/templates/cuda_init.service.j213
-rw-r--r--roles/cuda/templates/cuda_init.sh.j29
-rw-r--r--roles/cuda/tests/install.yml23
-rw-r--r--roles/cuda/tests/inventory5
-rw-r--r--roles/cuda/tests/test.yml12
-rw-r--r--roles/cuda/vars/centos-6.yml4
-rw-r--r--roles/cuda/vars/centos-7.yml4
-rw-r--r--roles/cuda/vars/fedora-28.yml4
-rw-r--r--roles/cuda/vars/main.yml5
-rw-r--r--roles/cuda/vars/redhat-6.yml4
-rw-r--r--roles/cuda/vars/redhat-7.yml4
-rw-r--r--roles/cuda/vars/ubuntu-14.04.yml4
-rw-r--r--roles/cuda/vars/ubuntu-14.10.yml4
-rw-r--r--roles/cuda/vars/ubuntu-15.yml4
-rw-r--r--roles/cuda/vars/ubuntu-16.04.yml4
-rw-r--r--roles/cuda/vars/ubuntu-17.04.yml4
28 files changed, 627 insertions, 0 deletions
diff --git a/roles/cuda/.travis.yml b/roles/cuda/.travis.yml
new file mode 100644
index 0000000..0cdcab4
--- /dev/null
+++ b/roles/cuda/.travis.yml
@@ -0,0 +1,53 @@
+---
+# Travis CI job: for each distro in `env`, sets up LXC containers, applies the
+# role inside them, then re-runs it to verify idempotency.
+language: python
+sudo: required
+dist: trusty
+
+matrix:
+  fast_finish: true
+
+cache:
+  directories: [ '$HOME/lxc/' ]
+  pip: true
+
+before_cache:
+  # mkdir -p: the cache directory may already exist, and a failing plain mkdir
+  # would short-circuit the tar that refreshes the cache.
+  - sudo mkdir -p $HOME/lxc && sudo tar cf $HOME/lxc/cache.tar /var/cache/lxc/ && sudo chown $USER: $HOME/lxc/cache.tar
+
+env:
+  - LXC_DISTRO=ubuntu LXC_RELEASE=xenial
+  - LXC_DISTRO=ubuntu LXC_RELEASE=trusty
+  - LXC_DISTRO=centos LXC_RELEASE=7
+  - LXC_DISTRO=centos LXC_RELEASE=6
+
+install:
+  # Restore cached LXC images; a missing cache on the first run is not an error.
+  - sudo tar xf $HOME/lxc/cache.tar -C / || true
+  # expect-dev is installed for `unbuffer`, used by the idempotency check below.
+  - sudo apt-get install -y expect-dev
+  - pip install ansible
+  - ansible --version
+  - printf '[defaults]\nroles_path=../\ncallback_whitelist=profile_tasks' >ansible.cfg
+  - ansible-galaxy install lae.travis-lxc
+  - ansible-playbook -vvv tests/install.yml -i tests/inventory
+
+script:
+  # Basic role syntax check
+  - ansible-playbook tests/test.yml -i tests/inventory --syntax-check
+
+  # Perform a test run with the playbook
+  - travis_wait ansible-playbook tests/test.yml -i tests/inventory
+
+  # Perform another test run with the playbook to check for idempotency
+  - 'unbuffer ansible-playbook tests/test.yml -i tests/inventory >/tmp/idempotency.log 2>&1'
+  - 'grep -A1 "PLAY RECAP" /tmp/idempotency.log | grep -qP "changed=0.*failed=0" &&
+    (echo "Idempotence: PASS"; exit 0) || (echo "Idempotence: FAIL"; cat /tmp/idempotency.log;
+    exit 1)'
+
+notifications:
+ email: false
+ flowdock:
+ secure: "lxqZTTUCUYy19JzwXnH/kRcndYwNasUYpa0AK0vcRqTRSoULRzwE2jcJUk2YBz0qTBcTgAkinj1VQbImdrl68NTPDTmOZM4+hPZ8RQNTGR7VJwy4Ynjl/RtvxmwvoW/kSZJI3twDvPpAl2yEKfiPwSE4kYNFs84w43WieNhX3qO0LN9EdFykV6M0xeZgGc71v6oGof3n9HhBMfMUYU6YZJKvirHJNwAxHsWiFSq+dfDA5hqDyilwuQM0toNgjsFi6F7b40vOVJPGJKdSNekrFgp2Gm/Fzd83sO/Sp1ord9v73UflshejKuK2/iMRddPW5JDl05FNrJ7x6xcCfK9sxOj5KDELotagHCDdCTqX4USelBpI7DeO9yV8NAIxyL2KAFi5b3uwvF5reitGhRfdeqA0B9eK+k6vdRQ/xKryYc48hVX46wraL2ibZv2gbic7vpYdxiWFUirKB9NBoQu7JHkTT/LT3LjbC9/uL9c4qRyakAnQwhgi4/sEk7f9euvtZA6MJfZpCzfiYVt3rGe6H9HqcCflnxW5F1ZjLBSkHk02rNn6hcfFxGHSS5x1362F9JCwVkAWgJ43JvQLRxobW4htbx56+niX0zS+vs2kK5K3NxUCJzInUb4UV5/9lwcCBZQJL0fD4u2Gy8/TC7MrhhOVxkSSPmjjoonaRPY497c="
diff --git a/roles/cuda/LICENSE b/roles/cuda/LICENSE
new file mode 100644
index 0000000..8b0c2e6
--- /dev/null
+++ b/roles/cuda/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 CSC - IT Center for Science
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/roles/cuda/README.md b/roles/cuda/README.md
new file mode 100644
index 0000000..d28e484
--- /dev/null
+++ b/roles/cuda/README.md
@@ -0,0 +1,64 @@
+[![Build Status](https://travis-ci.org/CSCfi/ansible-role-cuda.svg)](https://travis-ci.org/CSCfi/ansible-role-cuda)
+[![Galaxy Role](https://img.shields.io/badge/ansible--galaxy-cuda-blue.svg)](https://galaxy.ansible.com/CSCfi/cuda/)
+
+ansible-role-cuda
+=========
+
+Installs CUDA
+
+Tested with Tesla P100, K80, Tesla M40, CentOS7, Ubuntu 16.04, Cuda 7.5 and 8.0
+
+Optionally also installs cuda_init which initializes the GPUs during boot.
+
+Requirements
+------------
+
+Outbound access to http://developer.download.nvidia.com/compute/cuda/repos/
+
+Role Variables
+--------------
+
+    gpu: True
+    cuda_packages:
+      - cuda
+    cuda_restart_node_on_install: False
+    cuda_init: True
+    cuda_bash_profile: True
+
+- gpu: the role only does something when this is True (the default); set it to False on hosts without a GPU.
+- cuda_packages: List that can be updated to include more packages that are installed after nvidia cuda repo is installed, or to a specific cuda package (e.g. `cuda-7-5`)
+- cuda_init: Installs a bash script that is executed via systemd
+- cuda_gpu_name0: "/dev/nvidia0" - the device path Ansible checks for. If it does not exist and both cuda_init and cuda_init_restart_service are True, the role runs the cuda_init.sh script.
+- cuda_restart_node_on_install: restarts the system when packages are installed or updated
+
+
+Example Playbook
+----------------
+
+`playbook.yml`:
+
+ - hosts: deep_learning
+ roles:
+ - CSC-IT-Center-for-Science.cuda
+
+`inventory`:
+
+ [deep_learning]
+ host1.example gpu=True
+
+Example Errors
+--------------
+
+This error means you are not using a supported OS (one for which no matching file exists under `vars/`, so the CUDA repository URL cannot be determined)
+<pre>
+ "msg": "No file was found when using with_first_found. Use the 'skip: true' option to allow this task to be skipped if no files are found"
+</pre>
+
+License
+-------
+
+MIT
+
+Author Information
+------------------
+
diff --git a/roles/cuda/defaults/main.yml b/roles/cuda/defaults/main.yml
new file mode 100644
index 0000000..45ceb29
--- /dev/null
+++ b/roles/cuda/defaults/main.yml
@@ -0,0 +1,33 @@
+---
+# defaults file for ansible-role-cuda
+# The role only acts when gpu is true (the default here); set gpu to false in
+# the inventory for hosts that have no GPU.
+
+gpu: true
+# Base URL of the Nvidia package repositories; the tasks append
+# "/<cuda_repo_subfolder>/x86_64" to it.
+cuda_repo_url: "http://developer.download.nvidia.com/compute/cuda/repos/"
+cuda_rpm_key_path: /etc/rpm/nvidia_packaging_key.asc
+cuda_packages:
+  - cuda
+# Reboot the node after the CUDA packages were installed or updated.
+cuda_restart_node_on_install: false
+# Install cuda_init.sh (and, on systemd hosts, the cuda_init service).
+cuda_init: true
+# Allow the role to run cuda_init.sh / restart the service during the play.
+cuda_init_restart_service: false
+# Values passed to `nvidia-smi --compute-mode` / `--persistence-mode`.
+cuda_init_compute_mode: 3
+cuda_init_persistence_mode: 1
+# Device node whose absence triggers a run of cuda_init.sh.
+cuda_gpu_name0: "/dev/nvidia0"
+# Environment exported by /etc/profile.d/cuda.sh when cuda_bash_profile is true.
+cuda_bash_profile: true
+cuda_bash_path: "/usr/local/cuda/bin"
+cuda_bash_ld_lib_path: "/usr/local/cuda/lib64"
+cuda_bash_cuda_root: "/usr/local/cuda"
+# CUDA_INC_DIR is an include search path, so point it at include/, not bin/.
+cuda_bash_cuda_inc_dir: "/usr/local/cuda/include"
+cuda_bash_cpath: "/usr/local/cuda/include"
+
+# vim:ft=ansible:
diff --git a/roles/cuda/files/nvidia_packaging_key.asc b/roles/cuda/files/nvidia_packaging_key.asc
new file mode 100644
index 0000000..0c44d5a
--- /dev/null
+++ b/roles/cuda/files/nvidia_packaging_key.asc
@@ -0,0 +1,29 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1
+
+mQINBFdtt4UBEAC8FDSWMR07GJZ265giLn7kLF+EsJCWESUq6Cd13QN0JQ/tLibi
+QlW4ZjeOnEH9VPlqh/mKqNMG4SwRt8S+GHpePMQrr0aOkiRGfCclnAWIZURSAP+t
+PLelCt43fkw1BBTopd/0oOzO8kHu8j8WU4A8GHxqghfFWPv54FQs2iaZ2eWR7a6d
+79IJrbDKaVCCiQrkhCM8m648pNKHhuoJ9cQXFV+uvwkpfmKWGQ4ultxlOyjLHJLF
+vuML2RuAO9IxbdZjzeYNN+T+wjFIBVcPnwEO+WrYgvGkT4r9aqVqTeg3EPb7QclV
+sKBVJdxk4jZl0y22HAWqScVi6SJ15uK9pXxywDZkbpuRBWx4ThWiGe/FiUa2igi9
+/SIvqN2TBY0g18sRTrylVr1wE1UGa/y7nDx6PoGCP1frBt8YUYt3pkM8Xvb2CRxx
+CyWwmuFEQHC6jCEWf7FnoBHBYQwTVGNrU0vkuIeDrm+ZAcv8wx+ie1hlFhqCCJnf
+jqeQ0/zA9RPmCPOkLyTdSsNZtlxxk7bzCdTdFFKzBjGTR7Gz3SMSp23d11eIyRiF
+HQsp2v0SvnPJ6OcgB95Hmo544vi3RuoVfovtDOdfSBCRxP+GhhxkKSrTleQjD0/r
+CGkdG2Kox3m9YllAsvZchLXlS7bZV9mGRF61mVMjF3HJRUQfBBm89VPQ+QARAQAB
+tCBjdWRhdG9vbHMgPGN1ZGF0b29sc0BudmlkaWEuY29tPokCNwQTAQgAIQUCV223
+hQIbAwULCQgHAwUVCgkICwUWAgMBAAIeAQIXgAAKCRD2D0s9f6KvgNArEAChnfcW
+rYItgt7xXXubT6E+KpJyJ0RPrXf51S2mhciFbjDl+3EXRMRjOutVmgWYPWUUZaKR
+8Iez3Lz4BRmwYOWBLtdnOLbKoSsQUX95rnPFjfly/DFLfjKxz4NRBmh4r4/rCYWm
+2hmnXmOAi8kV7fqx3g5XMpJ//N6+T8ctEol2iZ82GrXjadcRWE4rAe7UyuEzJ74y
+6ZKIzk5ijdgEKtcaBhzEWvoV5Pr9nkn7ByGsdehKR/gNnjPMYXrklSHGfphJIsS2
+S32lMk/kuRjihBcWcYBXIPEQ7CV+PNW2TlkZj/YqTg637sZHwkhcjcNzxeqKvRYG
+8V7Ju5hTDxL1UQBmgDS3cRx1lw7tYRG5bS67tbC2dc/CpPkG5agiZ/WyoHQDnn4r
+1fRuOFx694QR6+0rAP6171xEEoNAPaH7gdJdhWKiYiJD0T2EEbW7wBUi/EupeKRv
+kR12R1jUa1mlpxNtWQxJ7qp98T9+DmkxI1XDmWx0/g4ryuicwLDSqoPgNcRNdSQb
+b8YfTqrkqaDdYzwLr/n0YKW3cYIvIeisV0WxRjb6OP7oAlAtaAhImlIc//51qNO7
+/WAud6qMtnhFoZayR/BzLKqnCioN5GYr9BAKskpPHe9cDKVS3fg+Qvc1sNJID+jf
+k52PqyW24Qsr0A9+5zQyE4tH9dfv120gj9avmg==
+=0nKc
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/roles/cuda/handlers/main.yml b/roles/cuda/handlers/main.yml
new file mode 100644
index 0000000..adba3b7
--- /dev/null
+++ b/roles/cuda/handlers/main.yml
@@ -0,0 +1,53 @@
+---
+# handlers file for ansible-role-cuda
+# Handlers run in the order they are defined in this file,
+# not in the order they are notified.
+
+- name: reload systemd unit files
+  command: systemctl daemon-reload
+
+# Applies compute/persistence mode right away after cuda_init.sh changed.
+- name: Initialize the GPUs
+  command: /bin/bash /usr/local/bin/cuda_init.sh
+  when:
+    - cuda_init
+    - cuda_init_restart_service
+
+- name: Restart cuda_init service
+  service:
+    name: cuda_init
+    state: restarted
+  when:
+    - cuda_init
+    - cuda_init_restart_service
+    - ansible_service_mgr == "systemd"
+
+# `shell`, not `command`: the "&&" must be interpreted by a shell, otherwise
+# sleep receives it as an argument and the reboot never happens. async/poll
+# detach the task so the play does not hang on the dropped connection.
+- name: ZZ CUDA Restart server
+  shell: sleep 2 && /sbin/shutdown -r now "Node software upgrade reboot"
+  async: 1
+  poll: 0
+  ignore_errors: true
+  when:
+    - cuda_packages_installation.changed
+    - cuda_restart_node_on_install
+
+# define the variable running_as_ansible_pull in the ansible-pull playbook, like local.yml
+# A port is required for wait_for to actually probe the host; without one the
+# module only sleeps for the full timeout.
+- name: ZZ CUDA Wait for server to restart
+  wait_for:
+    host: "{{ ansible_host | default(ansible_ssh_host | default(inventory_hostname)) }}"
+    port: "{{ ansible_port | default(ansible_ssh_port | default(22)) }}"
+    state: started
+    delay: 30
+    timeout: 300
+  connection: local
+  become: false
+  when:
+    - cuda_restart_node_on_install
+    - (running_as_ansible_pull is not defined or running_as_ansible_pull == False)
+
+# vim:ft=ansible:
diff --git a/roles/cuda/meta/main.yml b/roles/cuda/meta/main.yml
new file mode 100644
index 0000000..f103de5
--- /dev/null
+++ b/roles/cuda/meta/main.yml
@@ -0,0 +1,21 @@
+---
+# Ansible Galaxy metadata for the cuda role.
+galaxy_info:
+  author: Johan Guldmyr
+  description: Installs CUDA
+  company: CSC - IT Center for Science
+  license: MIT
+  # Quoted so the version stays a string instead of being parsed as a float.
+  min_ansible_version: "2.4"
+  platforms:
+    - name: EL
+      versions:
+        - 6
+        - 7
+    - name: Ubuntu
+      versions:
+        - trusty
+        - xenial
+  categories:
+    - system
+dependencies: []
diff --git a/roles/cuda/tasks/configure_apt.yml b/roles/cuda/tasks/configure_apt.yml
new file mode 100644
index 0000000..53a38a5
--- /dev/null
+++ b/roles/cuda/tasks/configure_apt.yml
@@ -0,0 +1,19 @@
+---
+# tasks file for ansible-role-cuda
+# Registers Nvidia's signing key and package repository with apt.
+
+- name: Trust packaging key for Nvidia repositories (apt)
+  apt_key:
+    # Bare file name: lookups inside a role already search the role's files/
+    # directory, matching how configure_yum.yml references the same key.
+    data: "{{ lookup('file', 'nvidia_packaging_key.asc') }}"
+    id: "{{ cuda_packaging_key_id }}"
+    state: present
+
+- name: Configure Nvidia repository (apt)
+  apt_repository:
+    repo: "deb {{ cuda_repo_url }}/{{ cuda_repo_subfolder }}/x86_64 /"
+    filename: nvidia
+    state: present
+
+# vim:ft=ansible:
diff --git a/roles/cuda/tasks/configure_yum.yml b/roles/cuda/tasks/configure_yum.yml
new file mode 100644
index 0000000..e888468
--- /dev/null
+++ b/roles/cuda/tasks/configure_yum.yml
@@ -0,0 +1,31 @@
+---
+# tasks file for ansible-role-cuda
+# Installs Nvidia's signing key and repository definition for yum/dnf hosts.
+
+- name: Upload packaging key for Nvidia repositories
+  copy:
+    dest: "{{ cuda_rpm_key_path }}"
+    src: nvidia_packaging_key.asc
+    mode: "0644"
+
+- name: Trust packaging key for Nvidia repositories (rpm)
+  rpm_key:
+    state: present
+    key: "{{ cuda_rpm_key_path }}"
+
+# 5C37D3BE is the id of the key this role no longer trusts.
+- name: Remove trust for old Nvidia packaging key
+  rpm_key:
+    state: absent
+    key: 5C37D3BE
+
+- name: Configure Nvidia repository (yum)
+  yum_repository:
+    name: nvidia
+    description: Official Nvidia repository
+    baseurl: "{{ cuda_repo_url }}/{{ cuda_repo_subfolder }}/x86_64/"
+    gpgkey: "file://{{ cuda_rpm_key_path }}"
+    gpgcheck: true
+    enabled: true
+
+# vim:ft=ansible:
diff --git a/roles/cuda/tasks/cuda_init.yml b/roles/cuda/tasks/cuda_init.yml
new file mode 100644
index 0000000..ff54994
--- /dev/null
+++ b/roles/cuda/tasks/cuda_init.yml
@@ -0,0 +1,54 @@
+---
+# Installs the GPU initialisation script, wires it into systemd where that is
+# the service manager, and runs it once if the GPU device node is missing.
+
+- name: template in cuda_init.sh used during boot
+  template:
+    src: cuda_init.sh.j2
+    dest: /usr/local/bin/cuda_init.sh
+    mode: "0755"
+  notify:
+    - Initialize the GPUs
+
+# Removes the cuda_init.sh line from rc.local if one is present.
+- name: lineinfile/make sure cuda_init.sh script is absent from rc.local
+  lineinfile:
+    dest: /etc/rc.local
+    state: absent
+    regexp: "^/bin/bash /usr/local/bin/cuda_init.sh$"
+    line: "/bin/bash /usr/local/bin/cuda_init.sh"
+    insertafter: "^touch /var/lock/subsys/local"
+
+- name: template in cuda_init.service systemd script
+  template:
+    src: cuda_init.service.j2
+    dest: /etc/systemd/system/cuda_init.service
+    mode: "0644"
+  when: ansible_service_mgr == "systemd"
+  notify:
+    - reload systemd unit files
+    - Restart cuda_init service
+
+- name: enable the cuda_init systemd service
+  service:
+    name: cuda_init
+    enabled: true
+  when: ansible_service_mgr == "systemd"
+
+# check_mode: false keeps the probe running in --check so the result is always
+# registered; failed_when: false makes a failing stat non-fatal.
+- name: check if cuda_gpu_name0 ( /dev/nvidia0 ) exists
+  stat:
+    path: "{{ cuda_gpu_name0 }}"
+  register: reg_cuda_gpu_name0
+  check_mode: false
+  failed_when: false
+
+- name: Initialize the GPUs - run cuda_init.sh if there is no /dev/nvidia0
+  command: /bin/bash /usr/local/bin/cuda_init.sh
+  when:
+    - reg_cuda_gpu_name0.stat.exists is defined
+    - not reg_cuda_gpu_name0.stat.exists
+    - cuda_init_restart_service
+
+# vim:ft=ansible:
diff --git a/roles/cuda/tasks/main.yml b/roles/cuda/tasks/main.yml
new file mode 100644
index 0000000..f292f67
--- /dev/null
+++ b/roles/cuda/tasks/main.yml
@@ -0,0 +1,67 @@
+---
+# tasks file for ansible-role-cuda
+# Loads the distribution-specific repository settings, then (only when gpu is
+# true) configures the Nvidia repository and installs CUDA.
+
+- name: "Gather OS specific variables"
+  include_vars: "{{ item }}"
+  with_first_found:
+    - "{{ ansible_distribution|lower }}-{{ ansible_distribution_version }}.yml"
+    - "{{ ansible_distribution|lower }}-{{ ansible_distribution_major_version }}.yml"
+    - "{{ ansible_distribution|lower }}.yml"
+    - "{{ ansible_os_family|lower }}.yml"
+
+- block:
+    - include_tasks: configure_yum.yml
+      when: ansible_pkg_mgr == 'yum' or ansible_pkg_mgr == 'dnf'
+
+    - include_tasks: configure_apt.yml
+      when: ansible_pkg_mgr == 'apt'
+
+    # kernel-devel is an RPM package name; apt-based hosts have no package
+    # called that, so the task is limited to yum/dnf hosts.
+    - name: Install kernel development files
+      package:
+        name: kernel-devel
+        state: present
+      register: result
+      when: ansible_pkg_mgr == 'yum' or ansible_pkg_mgr == 'dnf'
+
+    # "is changed" replaces the deprecated "| changed" filter form; a skipped
+    # result counts as unchanged, so this is a no-op on apt hosts.
+    - name: Synchronize kernel and kernel-devel packages
+      package:
+        name: kernel
+        state: latest
+      when: result is changed
+
+    - name: Install CUDA and related packages (1.5-2GB download, also restarts if cuda_restart_node_on_install is set to True)
+      package:
+        name: "{{ item }}"
+        state: present
+      with_items: "{{ cuda_packages }}"
+      register: cuda_packages_installation
+      notify:
+        - ZZ CUDA Restart server
+        - ZZ CUDA Wait for server to restart
+
+    - name: Template CUDA paths to user environments
+      template:
+        src: cuda.sh.j2
+        dest: /etc/profile.d/cuda.sh
+        mode: "0755"
+      when: cuda_bash_profile
+
+    - include_tasks: cuda_init.yml
+      when: cuda_init | bool
+
+    # This is here because if we in the same playbook try to start slurmd without
+    # having run the cuda_init.sh script then slurmd doesn't start and the play fails.
+    # todo: reload nvidia modules/etc instead of restart
+    - name: flush the handlers - so that the node is rebooted after CUDA is installed and that the GPUs are initialized before we start slurm
+      meta: flush_handlers
+
+  # "| bool" also accepts string values such as gpu=true passed with -e.
+  when: gpu | bool
+
+# vim:ft=ansible:
diff --git a/roles/cuda/templates/cuda.sh.j2 b/roles/cuda/templates/cuda.sh.j2
new file mode 100755
index 0000000..78393a1
--- /dev/null
+++ b/roles/cuda/templates/cuda.sh.j2
@@ -0,0 +1,11 @@
+#!/bin/bash
+# {{ ansible_managed }}
+# Adds the CUDA toolkit to the login environment.
+# ${VAR:+:$VAR} appends the previous value only when it is non-empty, so an
+# unset variable does not leave a trailing ":" (an empty path entry, which the
+# dynamic linker and compiler treat as the current directory).
+export PATH={{ cuda_bash_path }}${PATH:+:$PATH}
+export LD_LIBRARY_PATH={{ cuda_bash_ld_lib_path }}${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+export CUDA_ROOT={{ cuda_bash_cuda_root }}
+export CUDA_INC_DIR={{ cuda_bash_cuda_inc_dir }}${CUDA_INC_DIR:+:$CUDA_INC_DIR}
+export CPATH={{ cuda_bash_cpath }}${CPATH:+:$CPATH}
diff --git a/roles/cuda/templates/cuda_init.service.j2 b/roles/cuda/templates/cuda_init.service.j2
new file mode 100644
index 0000000..c61cbad
--- /dev/null
+++ b/roles/cuda/templates/cuda_init.service.j2
@@ -0,0 +1,16 @@
+# Runs cuda_init.sh once at boot so the GPUs are initialised before Slurm starts.
+[Unit]
+Description=Initialize nvidia/cuda devices
+# The Slurm node daemon unit is named slurm.service or slurmd.service depending
+# on the Slurm packaging; ordering against a unit that is not installed is ignored.
+Before=slurm.service slurmd.service
+DefaultDependencies=no
+Conflicts=shutdown.target
+
+[Service]
+ExecStart=/usr/local/bin/cuda_init.sh
+Type=oneshot
+RemainAfterExit=yes
+
+[Install]
+WantedBy=multi-user.target
diff --git a/roles/cuda/templates/cuda_init.sh.j2 b/roles/cuda/templates/cuda_init.sh.j2
new file mode 100644
index 0000000..3c85ea2
--- /dev/null
+++ b/roles/cuda/templates/cuda_init.sh.j2
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Boot-time GPU initialisation (creates /dev/nvidia*): applies the configured
+# compute mode and persistence mode through nvidia-smi.
+
+if [ -f "/usr/bin/nvidia-smi" ]; then
+    /usr/bin/nvidia-smi --compute-mode={{ cuda_init_compute_mode }}
+    /usr/bin/nvidia-smi --persistence-mode={{ cuda_init_persistence_mode }}
+else
+    # Without the binary there is nothing to configure; just record why.
+    logger -s -t nvidia-smi "Script $0 could not find /usr/bin/nvidia-smi"
+fi
diff --git a/roles/cuda/tests/install.yml b/roles/cuda/tests/install.yml
new file mode 100644
index 0000000..03b9360
--- /dev/null
+++ b/roles/cuda/tests/install.yml
@@ -0,0 +1,24 @@
+---
+# First play: applies the lae.travis-lxc role on the CI host itself.
+- hosts: localhost
+  connection: local
+  vars:
+    host_quantity: 1
+  roles:
+    - name: lae.travis-lxc
+
+# Run the following within the containers in the inventory
+- hosts: all
+  tasks:
+    # Solution for avahi-daemon issue from https://github.com/lxc/lxc/issues/25
+    - when: "ansible_distribution_release == 'xenial'"
+      block:
+        - name: Install avahi-daemon early on Ubuntu 16 containers
+          package:
+            name: avahi-daemon
+          ignore_errors: true
+        - name: Remove nproc from avahi-daemon.conf
+          lineinfile:
+            dest: /etc/avahi/avahi-daemon.conf
+            regexp: "^rlimit-nproc="
+            state: absent
diff --git a/roles/cuda/tests/inventory b/roles/cuda/tests/inventory
new file mode 100644
index 0000000..e45e40f
--- /dev/null
+++ b/roles/cuda/tests/inventory
@@ -0,0 +1,5 @@
+[cuda]
+test01.lxc
+
+[cuda:vars]
+ansible_ssh_user=root
diff --git a/roles/cuda/tests/test.yml b/roles/cuda/tests/test.yml
new file mode 100644
index 0000000..5125bda
--- /dev/null
+++ b/roles/cuda/tests/test.yml
@@ -0,0 +1,13 @@
+---
+# Applies the role with the reboot and the in-play cuda_init runs switched off.
+- hosts: all
+  become: true
+  vars:
+    gpu: true
+    cuda_restart_node_on_install: false
+    cuda_init: true
+    cuda_init_restart_service: false
+  roles:
+    - ansible-role-cuda
+
+# vim:ft=ansible:
diff --git a/roles/cuda/vars/centos-6.yml b/roles/cuda/vars/centos-6.yml
new file mode 100644
index 0000000..c4322ae
--- /dev/null
+++ b/roles/cuda/vars/centos-6.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: rhel6
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/centos-7.yml b/roles/cuda/vars/centos-7.yml
new file mode 100644
index 0000000..b331a96
--- /dev/null
+++ b/roles/cuda/vars/centos-7.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: rhel7
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/fedora-28.yml b/roles/cuda/vars/fedora-28.yml
new file mode 100644
index 0000000..e8af2d0
--- /dev/null
+++ b/roles/cuda/vars/fedora-28.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: fedora27
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/main.yml b/roles/cuda/vars/main.yml
new file mode 100644
index 0000000..ded4141
--- /dev/null
+++ b/roles/cuda/vars/main.yml
@@ -0,0 +1,5 @@
+---
+# vars file for ansible-role-cuda
+cuda_packaging_key_id: 7FA2AF80
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/redhat-6.yml b/roles/cuda/vars/redhat-6.yml
new file mode 100644
index 0000000..c4322ae
--- /dev/null
+++ b/roles/cuda/vars/redhat-6.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: rhel6
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/redhat-7.yml b/roles/cuda/vars/redhat-7.yml
new file mode 100644
index 0000000..b331a96
--- /dev/null
+++ b/roles/cuda/vars/redhat-7.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: rhel7
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/ubuntu-14.04.yml b/roles/cuda/vars/ubuntu-14.04.yml
new file mode 100644
index 0000000..61d04fd
--- /dev/null
+++ b/roles/cuda/vars/ubuntu-14.04.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: ubuntu1404
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/ubuntu-14.10.yml b/roles/cuda/vars/ubuntu-14.10.yml
new file mode 100644
index 0000000..8c8e53e
--- /dev/null
+++ b/roles/cuda/vars/ubuntu-14.10.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: ubuntu1410
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/ubuntu-15.yml b/roles/cuda/vars/ubuntu-15.yml
new file mode 100644
index 0000000..d1f56ad
--- /dev/null
+++ b/roles/cuda/vars/ubuntu-15.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: ubuntu1504
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/ubuntu-16.04.yml b/roles/cuda/vars/ubuntu-16.04.yml
new file mode 100644
index 0000000..f948a1a
--- /dev/null
+++ b/roles/cuda/vars/ubuntu-16.04.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: ubuntu1604
+
+# vim:ft=ansible:
\ No newline at end of file
diff --git a/roles/cuda/vars/ubuntu-17.04.yml b/roles/cuda/vars/ubuntu-17.04.yml
new file mode 100644
index 0000000..356467f
--- /dev/null
+++ b/roles/cuda/vars/ubuntu-17.04.yml
@@ -0,0 +1,4 @@
+---
+cuda_repo_subfolder: ubuntu1704
+
+# vim:ft=ansible: