1 // Copyright (C) 2011 Carl Rogers
2 // Released under MIT License
3 // license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
4
5 #include "cnpy.h"
6
7 #include <stdint.h>
8
9 #include <algorithm>
10 #include <complex>
11 #include <cstdlib>
12 #include <cstring>
13 #include <iomanip>
14 #include <regex>
15 #include <stdexcept>
16
BigEndianTest(int size)17 char cnpy::BigEndianTest(int size)
18 {
19 if (size == 1)
20 return '|';
21 int x = 1;
22 return (((char*)&x)[0]) ? '<' : '>';
23 }
24
map_type(const std::type_info & t)25 char cnpy::map_type(const std::type_info& t)
26 {
27 if (t == typeid(float))
28 return 'f';
29 if (t == typeid(double))
30 return 'f';
31 if (t == typeid(long double))
32 return 'f';
33
34 if (t == typeid(int))
35 return 'i';
36 if (t == typeid(char))
37 return 'i';
38 if (t == typeid(signed char))
39 return 'i';
40 if (t == typeid(short))
41 return 'i';
42 if (t == typeid(long))
43 return 'i';
44 if (t == typeid(long long))
45 return 'i';
46
47 if (t == typeid(unsigned char))
48 return 'u';
49 if (t == typeid(unsigned short))
50 return 'u';
51 if (t == typeid(unsigned long))
52 return 'u';
53 if (t == typeid(unsigned long long))
54 return 'u';
55 if (t == typeid(unsigned int))
56 return 'u';
57
58 if (t == typeid(bool))
59 return 'b';
60
61 if (t == typeid(std::complex<float>))
62 return 'c';
63 if (t == typeid(std::complex<double>))
64 return 'c';
65 if (t == typeid(std::complex<long double>))
66 return 'c';
67
68 else
69 return '?';
70 }
71
72 template <>
operator +=(std::vector<char> & lhs,const std::string rhs)73 std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const std::string rhs)
74 {
75 lhs.insert(lhs.end(), rhs.begin(), rhs.end());
76 return lhs;
77 }
78
79 template <>
operator +=(std::vector<char> & lhs,const char * rhs)80 std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const char* rhs)
81 {
82 // write in little endian
83 size_t len = strlen(rhs);
84 lhs.reserve(len);
85 for (size_t byte = 0; byte < len; byte++) {
86 lhs.push_back(rhs[byte]);
87 }
88 return lhs;
89 }
90
parse_npy_header(unsigned char * buffer,size_t & word_size,std::vector<size_t> & shape,bool & fortran_order,std::string & typeName)91 void cnpy::parse_npy_header(unsigned char* buffer, size_t& word_size, std::vector<size_t>& shape, bool& fortran_order,
92 std::string& typeName)
93 {
94 // std::string magic_string(buffer,6);
95 uint8_t major_version = *reinterpret_cast<uint8_t*>(buffer + 6);
96 uint8_t minor_version = *reinterpret_cast<uint8_t*>(buffer + 7);
97 uint16_t header_len = *reinterpret_cast<uint16_t*>(buffer + 8);
98 std::string header(reinterpret_cast<char*>(buffer + 9), header_len);
99
100 size_t loc1, loc2;
101
102 // fortran order
103 loc1 = header.find("fortran_order") + 16;
104 fortran_order = (header.substr(loc1, 4) == "True" ? true : false);
105 if (fortran_order)
106 throw std::runtime_error("npy input file: 'fortran_order' must be false, use: arr2 = np.ascontiguousarray(arr1)");
107
108 // shape
109 loc1 = header.find("(");
110 loc2 = header.find(")");
111
112 std::regex num_regex("[0-9][0-9]*");
113 std::smatch sm;
114 shape.clear();
115
116 std::string str_shape = header.substr(loc1 + 1, loc2 - loc1 - 1);
117 while (std::regex_search(str_shape, sm, num_regex)) {
118 shape.push_back(std::stoi(sm[0].str()));
119 str_shape = sm.suffix().str();
120 }
121
122 // endian, word size, data type
123 // byte order code | stands for not applicable.
124 // not sure when this applies except for byte array
125 loc1 = header.find("descr") + 9;
126 bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
127 assert(littleEndian);
128
129 // char type = header[loc1+1];
130 // assert(type == map_type(T));
131
132 std::string str_ws = header.substr(loc1 + 2);
133 loc2 = str_ws.find("'");
134 word_size = atoi(str_ws.substr(0, loc2).c_str());
135 if (header.substr(loc1 + 1, 1) == "i") {
136 typeName = "int";
137 } else if (header.substr(loc1 + 1, 1) == "u") {
138 typeName = "uint";
139 } else if (header.substr(loc1 + 1, 1) == "f") {
140 typeName = "float";
141 }
142 typeName = typeName + std::to_string(word_size * 8);
143 }
144
parse_npy_header(FILE * fp,size_t & word_size,std::vector<size_t> & shape,bool & fortran_order,std::string & typeName)145 void cnpy::parse_npy_header(FILE* fp, size_t& word_size, std::vector<size_t>& shape, bool& fortran_order,
146 std::string& typeName)
147 {
148 char buffer[256];
149 size_t res = fread(buffer, sizeof(char), 11, fp);
150 if (res != 11)
151 throw std::runtime_error("parse_npy_header: failed fread");
152 std::string header = fgets(buffer, 256, fp);
153 assert(header[header.size() - 1] == '\n');
154
155 size_t loc1, loc2;
156
157 // fortran order
158 loc1 = header.find("fortran_order");
159 if (loc1 == std::string::npos)
160 throw std::runtime_error("parse_npy_header: failed to find header keyword: 'fortran_order'");
161 loc1 += 16;
162 fortran_order = (header.substr(loc1, 4) == "True" ? true : false);
163 if (fortran_order)
164 throw std::runtime_error("npy input file: 'fortran_order' must be false, use: arr2 = np.ascontiguousarray(arr1)");
165
166 // shape
167 loc1 = header.find("(");
168 loc2 = header.find(")");
169 if (loc1 == std::string::npos || loc2 == std::string::npos)
170 throw std::runtime_error("parse_npy_header: failed to find header keyword: '(' or ')'");
171
172 std::regex num_regex("[0-9][0-9]*");
173 std::smatch sm;
174 shape.clear();
175
176 std::string str_shape = header.substr(loc1 + 1, loc2 - loc1 - 1);
177 while (std::regex_search(str_shape, sm, num_regex)) {
178 shape.push_back(std::stoi(sm[0].str()));
179 str_shape = sm.suffix().str();
180 }
181
182 // endian, word size, data type
183 // byte order code | stands for not applicable.
184 // not sure when this applies except for byte array
185 loc1 = header.find("descr");
186 if (loc1 == std::string::npos)
187 throw std::runtime_error("parse_npy_header: failed to find header keyword: 'descr'");
188 loc1 += 9;
189 bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
190 assert(littleEndian);
191
192 // char type = header[loc1+1];
193 // assert(type == map_type(T));
194
195 std::string str_ws = header.substr(loc1 + 2);
196 loc2 = str_ws.find("'");
197 word_size = atoi(str_ws.substr(0, loc2).c_str());
198 if (header.substr(loc1 + 1, 1) == "i") {
199 typeName = "int";
200 } else if (header.substr(loc1 + 1, 1) == "u") {
201 typeName = "uint";
202 } else if (header.substr(loc1 + 1, 1) == "f") {
203 typeName = "float";
204 }
205 typeName = typeName + std::to_string(word_size * 8);
206 }
207
parse_zip_footer(FILE * fp,uint16_t & nrecs,size_t & global_header_size,size_t & global_header_offset)208 void cnpy::parse_zip_footer(FILE* fp, uint16_t& nrecs, size_t& global_header_size, size_t& global_header_offset)
209 {
210 std::vector<char> footer(22);
211 fseek(fp, -22, SEEK_END);
212 size_t res = fread(&footer[0], sizeof(char), 22, fp);
213 if (res != 22)
214 throw std::runtime_error("parse_zip_footer: failed fread");
215
216 uint16_t disk_no, disk_start, nrecs_on_disk, comment_len;
217 disk_no = *(uint16_t*)&footer[4];
218 disk_start = *(uint16_t*)&footer[6];
219 nrecs_on_disk = *(uint16_t*)&footer[8];
220 nrecs = *(uint16_t*)&footer[10];
221 global_header_size = *(uint32_t*)&footer[12];
222 global_header_offset = *(uint32_t*)&footer[16];
223 comment_len = *(uint16_t*)&footer[20];
224
225 assert(disk_no == 0);
226 assert(disk_start == 0);
227 assert(nrecs_on_disk == nrecs);
228 assert(comment_len == 0);
229 }
230
load_the_npy_file(FILE * fp)231 cnpy::NpyArray load_the_npy_file(FILE* fp)
232 {
233 std::vector<size_t> shape;
234 size_t word_size;
235 std::string typeName;
236 bool fortran_order;
237 cnpy::parse_npy_header(fp, word_size, shape, fortran_order, typeName);
238
239 cnpy::NpyArray arr(shape, word_size, fortran_order, typeName);
240 size_t nread = fread(arr.data<char>(), 1, arr.num_bytes(), fp);
241 if (nread != arr.num_bytes())
242 throw std::runtime_error("load_the_npy_file: failed fread");
243 return arr;
244 }
245
load_the_npz_array(FILE * fp,uint32_t compr_bytes,uint32_t uncompr_bytes)246 cnpy::NpyArray load_the_npz_array(FILE* fp, uint32_t compr_bytes, uint32_t uncompr_bytes)
247 {
248 std::vector<unsigned char> buffer_compr(compr_bytes);
249 std::vector<unsigned char> buffer_uncompr(uncompr_bytes);
250 size_t nread = fread(&buffer_compr[0], 1, compr_bytes, fp);
251 if (nread != compr_bytes)
252 throw std::runtime_error("load_the_npy_file: failed fread");
253
254 #if 0
255 int err;
256 z_stream d_stream;
257
258 d_stream.zalloc = Z_NULL;
259 d_stream.zfree = Z_NULL;
260 d_stream.opaque = Z_NULL;
261 d_stream.avail_in = 0;
262 d_stream.next_in = Z_NULL;
263 err = inflateInit2(&d_stream, -MAX_WBITS);
264
265 d_stream.avail_in = compr_bytes;
266 d_stream.next_in = &buffer_compr[0];
267 d_stream.avail_out = uncompr_bytes;
268 d_stream.next_out = &buffer_uncompr[0];
269
270 err = inflate(&d_stream, Z_FINISH);
271 err = inflateEnd(&d_stream);
272 #endif
273
274 std::vector<size_t> shape;
275 size_t word_size;
276 bool fortran_order;
277 std::string typeName;
278 cnpy::parse_npy_header(&buffer_uncompr[0], word_size, shape, fortran_order, typeName);
279
280 cnpy::NpyArray array(shape, word_size, fortran_order, typeName);
281
282 size_t offset = uncompr_bytes - array.num_bytes();
283 memcpy(array.data<unsigned char>(), &buffer_uncompr[0] + offset, array.num_bytes());
284
285 return array;
286 }
287
npz_load(std::string fname)288 cnpy::npz_t cnpy::npz_load(std::string fname)
289 {
290 FILE* fp = fopen(fname.c_str(), "rb");
291
292 if (!fp) {
293 throw std::runtime_error("npz_load: Error! Unable to open file " + fname + "!");
294 }
295
296 cnpy::npz_t arrays;
297
298 while (1) {
299 std::vector<char> local_header(30);
300 size_t headerres = fread(&local_header[0], sizeof(char), 30, fp);
301 if (headerres != 30)
302 throw std::runtime_error("npz_load: failed fread");
303
304 // if we've reached the global header, stop reading
305 if (local_header[2] != 0x03 || local_header[3] != 0x04)
306 break;
307
308 // read in the variable name
309 uint16_t name_len = *(uint16_t*)&local_header[26];
310 std::string varname(name_len, ' ');
311 size_t vname_res = fread(&varname[0], sizeof(char), name_len, fp);
312 if (vname_res != name_len)
313 throw std::runtime_error("npz_load: failed fread");
314
315 // erase the lagging .npy
316 varname.erase(varname.end() - 4, varname.end());
317
318 // read in the extra field
319 uint16_t extra_field_len = *(uint16_t*)&local_header[28];
320 if (extra_field_len > 0) {
321 std::vector<char> buff(extra_field_len);
322 size_t efield_res = fread(&buff[0], sizeof(char), extra_field_len, fp);
323 if (efield_res != extra_field_len)
324 throw std::runtime_error("npz_load: failed fread");
325 }
326
327 uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0] + 8);
328 uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0] + 18);
329 uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0] + 22);
330
331 if (compr_method == 0) {
332 arrays[varname] = load_the_npy_file(fp);
333 } else {
334 arrays[varname] = load_the_npz_array(fp, compr_bytes, uncompr_bytes);
335 }
336 }
337
338 fclose(fp);
339 return arrays;
340 }
341
npz_load(std::string fname,std::string varname)342 cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname)
343 {
344 FILE* fp = fopen(fname.c_str(), "rb");
345
346 if (!fp)
347 throw std::runtime_error("npz_load: Unable to open file " + fname);
348
349 while (1) {
350 std::vector<char> local_header(30);
351 size_t header_res = fread(&local_header[0], sizeof(char), 30, fp);
352 if (header_res != 30)
353 throw std::runtime_error("npz_load: failed fread");
354
355 // if we've reached the global header, stop reading
356 if (local_header[2] != 0x03 || local_header[3] != 0x04)
357 break;
358
359 // read in the variable name
360 uint16_t name_len = *(uint16_t*)&local_header[26];
361 std::string vname(name_len, ' ');
362 size_t vname_res = fread(&vname[0], sizeof(char), name_len, fp);
363 if (vname_res != name_len)
364 throw std::runtime_error("npz_load: failed fread");
365 vname.erase(vname.end() - 4, vname.end()); // erase the lagging .npy
366
367 // read in the extra field
368 uint16_t extra_field_len = *(uint16_t*)&local_header[28];
369 fseek(fp, extra_field_len, SEEK_CUR); // skip past the extra field
370
371 uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0] + 8);
372 uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0] + 18);
373 uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0] + 22);
374
375 if (vname == varname) {
376 NpyArray array = (compr_method == 0) ? load_the_npy_file(fp) : load_the_npz_array(fp, compr_bytes, uncompr_bytes);
377 fclose(fp);
378 return array;
379 } else {
380 // skip past the data
381 uint32_t size = *(uint32_t*)&local_header[22];
382 fseek(fp, size, SEEK_CUR);
383 }
384 }
385
386 fclose(fp);
387
388 // if we get here, we haven't found the variable in the file
389 throw std::runtime_error("npz_load: Variable name " + varname + " not found in " + fname);
390 }
391
npy_load(std::string fname)392 cnpy::NpyArray cnpy::npy_load(std::string fname)
393 {
394 FILE* fp = fopen(fname.c_str(), "rb");
395
396 if (!fp)
397 throw std::runtime_error("npy_load: Unable to open file " + fname);
398
399 NpyArray arr = load_the_npy_file(fp);
400
401 fclose(fp);
402 return arr;
403 }
404