1*31b31015Sliushiwei // SPDX-License-Identifier: BSD-2-Clause 2*31b31015Sliushiwei /* 3*31b31015Sliushiwei * Copyright (c) 2020, Huawei Technologies Co., Ltd 4*31b31015Sliushiwei */ 5*31b31015Sliushiwei /* 6*31b31015Sliushiwei * Support for Thread-Local Storage (TLS) ABIs for ARMv7/Aarch32 and Aarch64. 7*31b31015Sliushiwei * 8*31b31015Sliushiwei * TAs are currently single-threaded, so the only benefit of implementing these 9*31b31015Sliushiwei * ABIs is to support toolchains that need them even when the target program is 10*31b31015Sliushiwei * single-threaded. Such as, the g++ compiler from the GCC toolchain targeting a 11*31b31015Sliushiwei * "Posix thread" Linux runtime, which OP-TEE has been using for quite some time 12*31b31015Sliushiwei * (arm-linux-gnueabihf-* and aarch64-linux-gnu-*). This allows building C++ TAs 13*31b31015Sliushiwei * without having to build a specific toolchain with --disable-threads. 14*31b31015Sliushiwei * 15*31b31015Sliushiwei * This implementation is based on [1]. 16*31b31015Sliushiwei * 17*31b31015Sliushiwei * - "TLS data structures variant 1" (section 3): the AArch64 compiler uses the 18*31b31015Sliushiwei * TPIDR_EL0 to access TLS data directly. This assumes a specific layout for 19*31b31015Sliushiwei * the TCB, and (for shared objects) the use of R_AARCH64_TLS_TPREL 20*31b31015Sliushiwei * relocations. 21*31b31015Sliushiwei * - The "General Dynamic access model" (section 4.1): the ARMv7/Aarch32 22*31b31015Sliushiwei * compiler inserts calls to the __tls_get_addr() function which has to be 23*31b31015Sliushiwei * implemented by the runtime library. The function takes a module ID and an 24*31b31015Sliushiwei * offset parameter, which are provided thanks to R_ARM_TLS_DTPMOD32 and 25*31b31015Sliushiwei * R_ARM_TLS_DTPOFF32 relocations. 26*31b31015Sliushiwei * 27*31b31015Sliushiwei * In addition, dl_iterate_phdr() is implemented here, because it is used by the 28*31b31015Sliushiwei * g++ Aarch64 exception handling and it does use the TCB to provide TLS 29*31b31015Sliushiwei * information to the caller. 30*31b31015Sliushiwei * 31*31b31015Sliushiwei * [1] "ELF Handling For Thread-Local Storage" 32*31b31015Sliushiwei * https://www.akkadia.org/drepper/tls.pdf 33*31b31015Sliushiwei */ 34*31b31015Sliushiwei 35*31b31015Sliushiwei #include <arm64_user_sysreg.h> 36*31b31015Sliushiwei #include <assert.h> 37*31b31015Sliushiwei #include <link.h> 38*31b31015Sliushiwei #include <stdlib.h> 39*31b31015Sliushiwei #include <string.h> 40*31b31015Sliushiwei #include <sys/queue.h> 41*31b31015Sliushiwei #include "user_ta_header.h" 42*31b31015Sliushiwei 43*31b31015Sliushiwei /* DTV - Dynamic Thread Vector 44*31b31015Sliushiwei * 45*31b31015Sliushiwei * Maintains an array of pointers to TLS data for each module in the TCB. Each 46*31b31015Sliushiwei * module that has a TLS segment has an entry (and consequently, some space in 47*31b31015Sliushiwei * the tcb_head::tls buffer). The index is the "module ID". 48*31b31015Sliushiwei * dtv[0].size is the number of elements in the vector 49*31b31015Sliushiwei * dtv[1].tls points to TLS for the main executable (may be NULL) 50*31b31015Sliushiwei * tls[2 .. (size-1)] are for shared libraries 51*31b31015Sliushiwei */ 52*31b31015Sliushiwei union dtv { 53*31b31015Sliushiwei unsigned long size; 54*31b31015Sliushiwei uint8_t *tls; 55*31b31015Sliushiwei }; 56*31b31015Sliushiwei 57*31b31015Sliushiwei #define DTV_SIZE(size) (sizeof(union dtv) + (size)) 58*31b31015Sliushiwei 59*31b31015Sliushiwei /* Thread Control Block */ 60*31b31015Sliushiwei struct tcb_head { 61*31b31015Sliushiwei /* Two words are reserved as per the "TLS variant 1" ABI */ 62*31b31015Sliushiwei union dtv *dtv; 63*31b31015Sliushiwei unsigned long reserved; 64*31b31015Sliushiwei /* 65*31b31015Sliushiwei * The rest of the structure contains the TLS blocks for each ELF module 66*31b31015Sliushiwei * having a PT_TLS segment. Each block is a copy of the .tdata section 67*31b31015Sliushiwei * plus some zero-initialized space for .tbss. 68*31b31015Sliushiwei */ 69*31b31015Sliushiwei uint8_t tls[]; 70*31b31015Sliushiwei }; 71*31b31015Sliushiwei 72*31b31015Sliushiwei /* 73*31b31015Sliushiwei * Since TAs are single threaded, only one TCB is needed. This would need to 74*31b31015Sliushiwei * change if multi-threading is introduced. 75*31b31015Sliushiwei */ 76*31b31015Sliushiwei static struct tcb_head *_tcb; 77*31b31015Sliushiwei static size_t _tls_size; 78*31b31015Sliushiwei 79*31b31015Sliushiwei #define TCB_SIZE(tls_size) (sizeof(*_tcb) + (tls_size)) 80*31b31015Sliushiwei 81*31b31015Sliushiwei /* 82*31b31015Sliushiwei * Initialize or update the TCB. 83*31b31015Sliushiwei * Called on application initialization and when additional shared objects are 84*31b31015Sliushiwei * loaded via dlopen(). 85*31b31015Sliushiwei */ 86*31b31015Sliushiwei void __utee_tcb_init(void) 87*31b31015Sliushiwei { 88*31b31015Sliushiwei struct dl_phdr_info *dlpi = NULL; 89*31b31015Sliushiwei const Elf_Phdr *phdr = NULL; 90*31b31015Sliushiwei size_t total_size = 0; 91*31b31015Sliushiwei size_t size = 0; 92*31b31015Sliushiwei size_t i = 0; 93*31b31015Sliushiwei size_t j = 0; 94*31b31015Sliushiwei 95*31b31015Sliushiwei /* Compute the size needed for all the TLS blocks */ 96*31b31015Sliushiwei for (i = 0; i < __elf_phdr_info.count; i++) { 97*31b31015Sliushiwei dlpi = __elf_phdr_info.dlpi + i; 98*31b31015Sliushiwei for (j = 0; j < dlpi->dlpi_phnum; j++) { 99*31b31015Sliushiwei phdr = dlpi->dlpi_phdr + j; 100*31b31015Sliushiwei if (phdr->p_type == PT_TLS) { 101*31b31015Sliushiwei total_size += phdr->p_memsz; 102*31b31015Sliushiwei break; 103*31b31015Sliushiwei } 104*31b31015Sliushiwei } 105*31b31015Sliushiwei } 106*31b31015Sliushiwei 107*31b31015Sliushiwei /* ELF modules currently cannot be unmapped */ 108*31b31015Sliushiwei assert(total_size >= _tls_size); 109*31b31015Sliushiwei 110*31b31015Sliushiwei if (total_size == _tls_size) 111*31b31015Sliushiwei return; 112*31b31015Sliushiwei 113*31b31015Sliushiwei /* (Re-)allocate the TCB */ 114*31b31015Sliushiwei _tcb = realloc(_tcb, TCB_SIZE(total_size)); 115*31b31015Sliushiwei if (!_tcb) { 116*31b31015Sliushiwei EMSG("TCB allocation failed (%zu bytes)", TCB_SIZE(total_size)); 117*31b31015Sliushiwei abort(); 118*31b31015Sliushiwei } 119*31b31015Sliushiwei 120*31b31015Sliushiwei /* (Re-)allocate the DTV. + 1 since dtv[0] holds the size */ 121*31b31015Sliushiwei size = DTV_SIZE((__elf_phdr_info.count + 1) * sizeof(union dtv)); 122*31b31015Sliushiwei _tcb->dtv = realloc(_tcb->dtv, size); 123*31b31015Sliushiwei if (!_tcb->dtv) { 124*31b31015Sliushiwei EMSG("DTV allocation failed (%zu bytes)", size); 125*31b31015Sliushiwei abort(); 126*31b31015Sliushiwei } 127*31b31015Sliushiwei 128*31b31015Sliushiwei /* Copy TLS data to the TCB */ 129*31b31015Sliushiwei size = 0; 130*31b31015Sliushiwei for (i = 0; i < __elf_phdr_info.count; i++) { 131*31b31015Sliushiwei dlpi = __elf_phdr_info.dlpi + i; 132*31b31015Sliushiwei for (j = 0; j < dlpi->dlpi_phnum; j++) { 133*31b31015Sliushiwei phdr = dlpi->dlpi_phdr + j; 134*31b31015Sliushiwei if (phdr->p_type != PT_TLS) 135*31b31015Sliushiwei continue; 136*31b31015Sliushiwei if (size + phdr->p_memsz <= _tls_size) { 137*31b31015Sliushiwei /* Already copied */ 138*31b31015Sliushiwei break; 139*31b31015Sliushiwei } 140*31b31015Sliushiwei _tcb->dtv[i + 1].tls = _tcb->tls + size; 141*31b31015Sliushiwei /* Copy .tdata */ 142*31b31015Sliushiwei memcpy(_tcb->tls + size, 143*31b31015Sliushiwei (void *)(dlpi->dlpi_addr + phdr->p_vaddr), 144*31b31015Sliushiwei phdr->p_filesz); 145*31b31015Sliushiwei /* Initialize .tbss */ 146*31b31015Sliushiwei memset(_tcb->tls + size + phdr->p_filesz, 0, 147*31b31015Sliushiwei phdr->p_memsz - phdr->p_filesz); 148*31b31015Sliushiwei size += phdr->p_memsz; 149*31b31015Sliushiwei } 150*31b31015Sliushiwei } 151*31b31015Sliushiwei _tcb->dtv[0].size = i; 152*31b31015Sliushiwei 153*31b31015Sliushiwei _tls_size = total_size; 154*31b31015Sliushiwei #ifdef ARM64 155*31b31015Sliushiwei /* 156*31b31015Sliushiwei * Aarch64 ABI requirement: the thread pointer shall point to the 157*31b31015Sliushiwei * thread's TCB. ARMv7 and Aarch32 access the TCB via _tls_get_addr(). 158*31b31015Sliushiwei */ 159*31b31015Sliushiwei write_tpidr_el0((vaddr_t)_tcb); 160*31b31015Sliushiwei #endif 161*31b31015Sliushiwei } 162*31b31015Sliushiwei 163*31b31015Sliushiwei struct tls_index { 164*31b31015Sliushiwei unsigned long module; 165*31b31015Sliushiwei unsigned long offset; 166*31b31015Sliushiwei }; 167*31b31015Sliushiwei 168*31b31015Sliushiwei void *__tls_get_addr(struct tls_index *ti); 169*31b31015Sliushiwei 170*31b31015Sliushiwei void *__tls_get_addr(struct tls_index *ti) 171*31b31015Sliushiwei { 172*31b31015Sliushiwei return _tcb->dtv[ti->module].tls + ti->offset; 173*31b31015Sliushiwei } 174*31b31015Sliushiwei 175*31b31015Sliushiwei int dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *), 176*31b31015Sliushiwei void *data) 177*31b31015Sliushiwei { 178*31b31015Sliushiwei struct dl_phdr_info *dlpi = NULL; 179*31b31015Sliushiwei size_t id = 0; 180*31b31015Sliushiwei size_t i = 0; 181*31b31015Sliushiwei int st = 0; 182*31b31015Sliushiwei 183*31b31015Sliushiwei /* 184*31b31015Sliushiwei * dlpi_tls_data is thread-specific so if we were to support 185*31b31015Sliushiwei * multi-threading, we would need one copy of struct dl_phdr_info per 186*31b31015Sliushiwei * thread. Could be a pre-allocated area, or could be allocated on the 187*31b31015Sliushiwei * heap. Doing the latter here so that it would at least work if/when we 188*31b31015Sliushiwei * add thread support. Further optimization can always come later. 189*31b31015Sliushiwei */ 190*31b31015Sliushiwei dlpi = calloc(1, sizeof(*dlpi)); 191*31b31015Sliushiwei if (!dlpi) { 192*31b31015Sliushiwei EMSG("dl_phdr_info allocation failed"); 193*31b31015Sliushiwei abort(); 194*31b31015Sliushiwei } 195*31b31015Sliushiwei 196*31b31015Sliushiwei for (i = 0; i < __elf_phdr_info.count; i++) { 197*31b31015Sliushiwei memcpy(dlpi, __elf_phdr_info.dlpi + i, sizeof(*dlpi)); 198*31b31015Sliushiwei dlpi->dlpi_tls_data = NULL; 199*31b31015Sliushiwei id = dlpi->dlpi_tls_modid; 200*31b31015Sliushiwei if (id) 201*31b31015Sliushiwei dlpi->dlpi_tls_data = _tcb->dtv[id].tls; 202*31b31015Sliushiwei st = callback(dlpi, sizeof(*dlpi), data); 203*31b31015Sliushiwei } 204*31b31015Sliushiwei 205*31b31015Sliushiwei free(dlpi); 206*31b31015Sliushiwei return st; 207*31b31015Sliushiwei } 208