// SPDX-License-Identifier: BSD-2-Clause
/*
 * Copyright (c) 2020, Huawei Technologies Co., Ltd
 */
/*
 * Support for Thread-Local Storage (TLS) ABIs for ARMv7/Aarch32 and Aarch64.
 *
 * TAs are currently single-threaded, so the only benefit of implementing these
 * ABIs is to support toolchains that need them even when the target program is
 * single-threaded. One example is the g++ compiler from the GCC toolchain
 * targeting a "Posix thread" Linux runtime, which OP-TEE has been using for
 * quite some time (arm-linux-gnueabihf-* and aarch64-linux-gnu-*). This allows
 * building C++ TAs without having to build a specific toolchain with
 * --disable-threads.
 *
 * This implementation is based on [1].
 *
 * - "TLS data structures variant 1" (section 3): the AArch64 compiler uses the
 *   TPIDR_EL0 to access TLS data directly. This assumes a specific layout for
 *   the TCB, and (for shared objects) the use of R_AARCH64_TLS_TPREL
 *   relocations.
 * - The "General Dynamic access model" (section 4.1): the ARMv7/Aarch32
 *   compiler inserts calls to the __tls_get_addr() function which has to be
 *   implemented by the runtime library. The function takes a module ID and an
 *   offset parameter, which are provided thanks to R_ARM_TLS_DTPMOD32 and
 *   R_ARM_TLS_DTPOFF32 relocations.
 *
 * In addition, dl_iterate_phdr() is implemented here, because it is used by the
 * g++ Aarch64 exception handling and it does use the TCB to provide TLS
 * information to the caller.
 *
 * [1] "ELF Handling For Thread-Local Storage"
 *     https://www.akkadia.org/drepper/tls.pdf
 */
3431b31015Sliushiwei
3531b31015Sliushiwei #include <arm64_user_sysreg.h>
3631b31015Sliushiwei #include <assert.h>
3731b31015Sliushiwei #include <link.h>
3831b31015Sliushiwei #include <stdlib.h>
3931b31015Sliushiwei #include <string.h>
4031b31015Sliushiwei #include <sys/queue.h>
4131b31015Sliushiwei #include "user_ta_header.h"
4231b31015Sliushiwei
/*
 * DTV - Dynamic Thread Vector
 *
 * Maintains an array of pointers to TLS data for each module in the TCB. Each
 * module that has a TLS segment has an entry (and consequently, some space in
 * the tcb_head::tls buffer). The index is the "module ID".
 * dtv[0].size is the number of elements in the vector
 * dtv[1].tls points to TLS for the main executable (may be NULL)
 * dtv[2 .. (size-1)].tls are for shared libraries
 */
union dtv {
	unsigned long size;	/* Only meaningful in dtv[0] */
	uint8_t *tls;		/* Start of a module's TLS block (dtv[1..]) */
};

/*
 * Number of bytes to allocate for a DTV: one extra union dtv on top of the
 * @size bytes requested by the caller for the entries themselves.
 */
#define DTV_SIZE(size)	(sizeof(union dtv) + (size))
5831b31015Sliushiwei
/* Thread Control Block */
struct tcb_head {
	/* Two words are reserved as per the "TLS variant 1" ABI */
	union dtv *dtv;		/* Dynamic Thread Vector, allocated separately */
	unsigned long reserved;
	/*
	 * The rest of the structure contains the TLS blocks for each ELF module
	 * having a PT_TLS segment. Each block is a copy of the .tdata section
	 * plus some zero-initialized space for .tbss.
	 */
	uint8_t tls[];
};
7131b31015Sliushiwei
/*
 * Since TAs are single threaded, only one TCB is needed. This would need to
 * change if multi-threading is introduced.
 *
 * _tcb is NULL until __utee_tcb_init() has found at least one PT_TLS segment.
 * _tls_size is the number of bytes of _tcb->tls that have been initialized so
 * far.
 */
static struct tcb_head *_tcb;
static size_t _tls_size;

/* Allocation size in bytes of a TCB holding @tls_size bytes of TLS data */
#define TCB_SIZE(tls_size)	(sizeof(*_tcb) + (tls_size))
8031b31015Sliushiwei
8131b31015Sliushiwei /*
8231b31015Sliushiwei * Initialize or update the TCB.
8331b31015Sliushiwei * Called on application initialization and when additional shared objects are
8431b31015Sliushiwei * loaded via dlopen().
8531b31015Sliushiwei */
__utee_tcb_init(void)8631b31015Sliushiwei void __utee_tcb_init(void)
8731b31015Sliushiwei {
8831b31015Sliushiwei struct dl_phdr_info *dlpi = NULL;
8931b31015Sliushiwei const Elf_Phdr *phdr = NULL;
9031b31015Sliushiwei size_t total_size = 0;
9131b31015Sliushiwei size_t size = 0;
9231b31015Sliushiwei size_t i = 0;
9331b31015Sliushiwei size_t j = 0;
9431b31015Sliushiwei
9531b31015Sliushiwei /* Compute the size needed for all the TLS blocks */
9631b31015Sliushiwei for (i = 0; i < __elf_phdr_info.count; i++) {
9731b31015Sliushiwei dlpi = __elf_phdr_info.dlpi + i;
9831b31015Sliushiwei for (j = 0; j < dlpi->dlpi_phnum; j++) {
9931b31015Sliushiwei phdr = dlpi->dlpi_phdr + j;
10031b31015Sliushiwei if (phdr->p_type == PT_TLS) {
10131b31015Sliushiwei total_size += phdr->p_memsz;
10231b31015Sliushiwei break;
10331b31015Sliushiwei }
10431b31015Sliushiwei }
10531b31015Sliushiwei }
10631b31015Sliushiwei
10731b31015Sliushiwei /* ELF modules currently cannot be unmapped */
10831b31015Sliushiwei assert(total_size >= _tls_size);
10931b31015Sliushiwei
11031b31015Sliushiwei if (total_size == _tls_size)
11131b31015Sliushiwei return;
11231b31015Sliushiwei
11331b31015Sliushiwei /* (Re-)allocate the TCB */
114*34611468SJens Wiklander _tcb = malloc_flags(MAF_ZERO_INIT, _tcb, 1, TCB_SIZE(total_size));
11531b31015Sliushiwei if (!_tcb) {
11631b31015Sliushiwei EMSG("TCB allocation failed (%zu bytes)", TCB_SIZE(total_size));
11731b31015Sliushiwei abort();
11831b31015Sliushiwei }
11931b31015Sliushiwei
12031b31015Sliushiwei /* (Re-)allocate the DTV. + 1 since dtv[0] holds the size */
12131b31015Sliushiwei size = DTV_SIZE((__elf_phdr_info.count + 1) * sizeof(union dtv));
122*34611468SJens Wiklander _tcb->dtv = malloc_flags(MAF_ZERO_INIT, _tcb->dtv, 1, size);
12331b31015Sliushiwei if (!_tcb->dtv) {
12431b31015Sliushiwei EMSG("DTV allocation failed (%zu bytes)", size);
12531b31015Sliushiwei abort();
12631b31015Sliushiwei }
12731b31015Sliushiwei
12831b31015Sliushiwei /* Copy TLS data to the TCB */
12931b31015Sliushiwei size = 0;
13031b31015Sliushiwei for (i = 0; i < __elf_phdr_info.count; i++) {
13131b31015Sliushiwei dlpi = __elf_phdr_info.dlpi + i;
13231b31015Sliushiwei for (j = 0; j < dlpi->dlpi_phnum; j++) {
13331b31015Sliushiwei phdr = dlpi->dlpi_phdr + j;
13431b31015Sliushiwei if (phdr->p_type != PT_TLS)
13531b31015Sliushiwei continue;
13631b31015Sliushiwei if (size + phdr->p_memsz <= _tls_size) {
13731b31015Sliushiwei /* Already copied */
13831b31015Sliushiwei break;
13931b31015Sliushiwei }
14031b31015Sliushiwei _tcb->dtv[i + 1].tls = _tcb->tls + size;
14131b31015Sliushiwei /* Copy .tdata */
14231b31015Sliushiwei memcpy(_tcb->tls + size,
14331b31015Sliushiwei (void *)(dlpi->dlpi_addr + phdr->p_vaddr),
14431b31015Sliushiwei phdr->p_filesz);
14531b31015Sliushiwei /* Initialize .tbss */
14631b31015Sliushiwei memset(_tcb->tls + size + phdr->p_filesz, 0,
14731b31015Sliushiwei phdr->p_memsz - phdr->p_filesz);
14831b31015Sliushiwei size += phdr->p_memsz;
14931b31015Sliushiwei }
15031b31015Sliushiwei }
15131b31015Sliushiwei _tcb->dtv[0].size = i;
15231b31015Sliushiwei
15331b31015Sliushiwei _tls_size = total_size;
15431b31015Sliushiwei #ifdef ARM64
15531b31015Sliushiwei /*
15631b31015Sliushiwei * Aarch64 ABI requirement: the thread pointer shall point to the
15731b31015Sliushiwei * thread's TCB. ARMv7 and Aarch32 access the TCB via _tls_get_addr().
15831b31015Sliushiwei */
15931b31015Sliushiwei write_tpidr_el0((vaddr_t)_tcb);
16031b31015Sliushiwei #endif
16131b31015Sliushiwei }
16231b31015Sliushiwei
/* Argument of __tls_get_addr(): designates one TLS variable */
struct tls_index {
	unsigned long module;	/* Module ID, used as index into the DTV */
	unsigned long offset;	/* Byte offset inside the module's TLS block */
};
16731b31015Sliushiwei
16831b31015Sliushiwei void *__tls_get_addr(struct tls_index *ti);
16931b31015Sliushiwei
__tls_get_addr(struct tls_index * ti)17031b31015Sliushiwei void *__tls_get_addr(struct tls_index *ti)
17131b31015Sliushiwei {
17231b31015Sliushiwei return _tcb->dtv[ti->module].tls + ti->offset;
17331b31015Sliushiwei }
17431b31015Sliushiwei
dl_iterate_phdr(int (* callback)(struct dl_phdr_info *,size_t,void *),void * data)17531b31015Sliushiwei int dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
17631b31015Sliushiwei void *data)
17731b31015Sliushiwei {
17831b31015Sliushiwei struct dl_phdr_info *dlpi = NULL;
17931b31015Sliushiwei size_t id = 0;
18031b31015Sliushiwei size_t i = 0;
18131b31015Sliushiwei int st = 0;
18231b31015Sliushiwei
18331b31015Sliushiwei /*
18431b31015Sliushiwei * dlpi_tls_data is thread-specific so if we were to support
18531b31015Sliushiwei * multi-threading, we would need one copy of struct dl_phdr_info per
18631b31015Sliushiwei * thread. Could be a pre-allocated area, or could be allocated on the
18731b31015Sliushiwei * heap. Doing the latter here so that it would at least work if/when we
18831b31015Sliushiwei * add thread support. Further optimization can always come later.
18931b31015Sliushiwei */
19031b31015Sliushiwei dlpi = calloc(1, sizeof(*dlpi));
19131b31015Sliushiwei if (!dlpi) {
19231b31015Sliushiwei EMSG("dl_phdr_info allocation failed");
19331b31015Sliushiwei abort();
19431b31015Sliushiwei }
19531b31015Sliushiwei
19631b31015Sliushiwei for (i = 0; i < __elf_phdr_info.count; i++) {
19731b31015Sliushiwei memcpy(dlpi, __elf_phdr_info.dlpi + i, sizeof(*dlpi));
19831b31015Sliushiwei dlpi->dlpi_tls_data = NULL;
19931b31015Sliushiwei id = dlpi->dlpi_tls_modid;
20031b31015Sliushiwei if (id)
20131b31015Sliushiwei dlpi->dlpi_tls_data = _tcb->dtv[id].tls;
20231b31015Sliushiwei st = callback(dlpi, sizeof(*dlpi), data);
20331b31015Sliushiwei }
20431b31015Sliushiwei
20531b31015Sliushiwei free(dlpi);
20631b31015Sliushiwei return st;
20731b31015Sliushiwei }
208