Coverage for peakipy/io.py: 92%

486 statements  


import sys
from pathlib import Path
from enum import Enum

import numpy as np
import nmrglue as ng
import pandas as pd
import textwrap
from rich import print
from rich.console import Console


from bokeh.palettes import Category20
from scipy import ndimage
from skimage.morphology import square, binary_closing, disk, rectangle
from skimage.filters import threshold_otsu
from pydantic import BaseModel

from peakipy.utils import df_to_rich_table
from peakipy.fitting import make_mask

console = Console()


class StrucEl(str, Enum):
    square = "square"
    disk = "disk"
    rectangle = "rectangle"
    mask_method = "mask_method"


class PeaklistFormat(str, Enum):
    a2 = "a2"
    a3 = "a3"
    sparky = "sparky"
    pipe = "pipe"
    peakipy = "peakipy"


class OutFmt(str, Enum):
    csv = "csv"
    pkl = "pkl"


class PeaklistColumns(BaseModel):
    """These are the columns required for performing fits in peakipy"""

    INDEX: int
    X_AXIS: int
    Y_AXIS: int
    X_AXISf: float
    Y_AXISf: float
    X_PPM: float
    Y_PPM: float
    XW: float
    YW: float
    XW_HZ: float
    YW_HZ: float
    HEIGHT: float
    VOL: float
    ASS: str
    X_RADIUS: float
    Y_RADIUS: float
    X_RADIUS_PPM: float
    Y_RADIUS_PPM: float
    include: str


class PeaklistColumnsWithClusters(PeaklistColumns):
    CLUSTID: int
    MEMCNT: int
    color: str


class Pseudo3D:
    """Read dic, data from NMRGlue and dims from input to create a Pseudo3D dataset

    :param dic: from nmrglue.pipe.read
    :type dic: dict

    :param data: data from nmrglue.pipe.read
    :type data: numpy.array

    :param dims: dimension order i.e. [0, 1, 2] where 0 = planes, 1 = f1, 2 = f2
    :type dims: list
    """

    def __init__(self, dic, data, dims):
        # check dimensions
        self._udic = ng.pipe.guess_udic(dic, data)
        self._ndim = self._udic["ndim"]

        if self._ndim == 1:
            err = f"""[red]
            ##########################################
            NMR Data should be either 2D or 3D
            ##########################################
            [/red]"""
            # raise TypeError(err)
            sys.exit(err)

        # check that spectrum has correct number of dims
        elif self._ndim != len(dims):
            err = f"""[red]
            #################################################################
            Your spectrum has {self._ndim} dimensions with shape {data.shape}
            but you have given a dimension order of {dims}...
            #################################################################
            [/red]"""
            # raise ValueError(err)
            sys.exit(err)

        elif (self._ndim == 2) and (len(dims) == 2):
            self._f1_dim, self._f2_dim = dims
            self._planes = 0
            self._uc_f1 = ng.pipe.make_uc(dic, data, dim=self._f1_dim)
            self._uc_f2 = ng.pipe.make_uc(dic, data, dim=self._f2_dim)
            # make data pseudo3d
            self._data = data.reshape((1, data.shape[0], data.shape[1]))
            self._dims = [self._planes, self._f1_dim + 1, self._f2_dim + 1]

        else:
            self._planes, self._f1_dim, self._f2_dim = dims
            self._dims = dims
            self._data = data
            # make unit conversion dicts
            self._uc_f2 = ng.pipe.make_uc(dic, data, dim=self._f2_dim)
            self._uc_f1 = ng.pipe.make_uc(dic, data, dim=self._f1_dim)

        #  rearrange data if dims not in standard order
        if self._dims != [0, 1, 2]:
            # np.argsort returns indices of array for order 0,1,2 to transpose data correctly
            # self._dims = np.argsort(self._dims)
            self._data = np.transpose(data, self._dims)

        self._dic = dic

        self._f1_label = self._udic[self._f1_dim]["label"]
        self._f2_label = self._udic[self._f2_dim]["label"]

    @property
    def uc_f1(self):
        """Return unit conversion dict for F1"""
        return self._uc_f1

    @property
    def uc_f2(self):
        """Return unit conversion dict for F2"""
        return self._uc_f2

    @property
    def dims(self):
        """Return dimension order"""
        return self._dims

    @property
    def data(self):
        """Return array containing data"""
        return self._data

    @data.setter
    def data(self, data):
        self._data = data

    @property
    def dic(self):
        return self._dic

    @property
    def udic(self):
        return self._udic

    @property
    def ndim(self):
        return self._ndim

    @property
    def f1_label(self):
        # dim label
        return self._f1_label

    @property
    def f2_label(self):
        # dim label
        return self._f2_label

    @property
    def planes(self):
        return self.dims[0]

    @property
    def n_planes(self):
        return self.data.shape[self.planes]

    @property
    def f1(self):
        return self.dims[1]

    @property
    def f2(self):
        return self.dims[2]

    # size of f1 and f2 in points
    @property
    def f2_size(self):
        """Return size of f2 dimension in points"""
        return self._udic[self._f2_dim]["size"]

    @property
    def f1_size(self):
        """Return size of f1 dimension in points"""
        return self._udic[self._f1_dim]["size"]

    # points per ppm
    @property
    def pt_per_ppm_f1(self):
        return self.f1_size / (
            self._udic[self._f1_dim]["sw"] / self._udic[self._f1_dim]["obs"]
        )

    @property
    def pt_per_ppm_f2(self):
        return self.f2_size / (
            self._udic[self._f2_dim]["sw"] / self._udic[self._f2_dim]["obs"]
        )

    # points per hz
    @property
    def pt_per_hz_f1(self):
        return self.f1_size / self._udic[self._f1_dim]["sw"]

    @property
    def pt_per_hz_f2(self):
        return self.f2_size / self._udic[self._f2_dim]["sw"]

    # hz per point
    @property
    def hz_per_pt_f1(self):
        return 1.0 / self.pt_per_hz_f1

    @property
    def hz_per_pt_f2(self):
        return 1.0 / self.pt_per_hz_f2

    # ppm per point
    @property
    def ppm_per_pt_f1(self):
        return 1.0 / self.pt_per_ppm_f1

    @property
    def ppm_per_pt_f2(self):
        return 1.0 / self.pt_per_ppm_f2

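    # The conversions above follow from the spectral parameters: for a
    # dimension with sweep width `sw` (Hz), observe frequency `obs` (MHz)
    # and `size` points, the ppm width of the dimension is sw / obs, so
    # (a sketch of the arithmetic; the values below are made up for
    # illustration):
    #
    #   pt_per_hz  = size / sw
    #   pt_per_ppm = size / (sw / obs) = pt_per_hz * obs
    #
    # e.g. size = 2048, sw = 10000.0 Hz, obs = 600.0 MHz gives
    # pt_per_hz ~= 0.205 and pt_per_ppm ~= 122.88.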

    # get ppm limits for ppm scales
    @property
    def f2_ppm_scale(self):
        return self.uc_f2.ppm_scale()

    @property
    def f1_ppm_scale(self):
        return self.uc_f1.ppm_scale()

    @property
    def f2_ppm_limits(self):
        return self.uc_f2.ppm_limits()

    @property
    def f1_ppm_limits(self):
        return self.uc_f1.ppm_limits()

    @property
    def f1_ppm_max(self):
        return max(self.f1_ppm_limits)

    @property
    def f1_ppm_min(self):
        return min(self.f1_ppm_limits)

    @property
    def f2_ppm_max(self):
        return max(self.f2_ppm_limits)

    @property
    def f2_ppm_min(self):
        return min(self.f2_ppm_limits)

    @property
    def f2_ppm_0(self):
        return self.f2_ppm_limits[0]

    @property
    def f2_ppm_1(self):
        return self.f2_ppm_limits[1]

    @property
    def f1_ppm_0(self):
        return self.f1_ppm_limits[0]

    @property
    def f1_ppm_1(self):
        return self.f1_ppm_limits[1]

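# A minimal usage sketch for Pseudo3D (the path "test.ft2" is a hypothetical
# pseudo-3D NMRPipe file; dims follows the [planes, f1, f2] order documented
# in the class docstring):
#
#   import nmrglue as ng
#   dic, data = ng.pipe.read("test.ft2")
#   spectrum = Pseudo3D(dic, data, dims=[0, 1, 2])
#   print(spectrum.n_planes, spectrum.f1_ppm_limits, spectrum.f2_ppm_limits)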

class UnknownFormat(Exception):
    pass


class Peaklist(Pseudo3D):
    """Read analysis, Sparky or NMRPipe peak list, convert it to an NMRPipe-ish format and find peak clusters

    Parameters
    ----------
    path : path-like or str
        path to peaklist
    data_path : path-like or str
        path to NMRPipe format data
    fmt : PeaklistFormat
        a2|a3|sparky|pipe
    dims : list
        [planes, y, x]
    radii : list
        [x, y] mask radii in ppm


    Methods
    -------

    clusters :
    mask_method :
    adaptive_clusters :

    Returns
    -------
    df : pandas DataFrame
        dataframe containing peaklist

    """

    def __init__(
        self,
        path,
        data_path,
        fmt: PeaklistFormat = PeaklistFormat.a2,
        dims=[0, 1, 2],
        radii=[0.04, 0.4],
        posF1="Position F2",
        posF2="Position F1",
        verbose=False,
    ):
        dic, data = ng.pipe.read(data_path)
        Pseudo3D.__init__(self, dic, data, dims)
        self.fmt = fmt
        self.peaklist_path = path
        self.data_path = data_path
        self.verbose = verbose
        self._radii = radii
        self._thres = None
        if self.verbose:
            print(
                "Points per hz f1 = %.3f, f2 = %.3f"
                % (self.pt_per_hz_f1, self.pt_per_hz_f2)
            )

        self._analysis_to_pipe_dic = {
            "#": "INDEX",
            "Position F1": "X_PPM",
            "Position F2": "Y_PPM",
            "Line Width F1 (Hz)": "XW_HZ",
            "Line Width F2 (Hz)": "YW_HZ",
            "Height": "HEIGHT",
            "Volume": "VOL",
        }
        self._assign_to_pipe_dic = {
            "#": "INDEX",
            "Pos F1": "X_PPM",
            "Pos F2": "Y_PPM",
            "LW F1 (Hz)": "XW_HZ",
            "LW F2 (Hz)": "YW_HZ",
            "Height": "HEIGHT",
            "Volume": "VOL",
        }

        self._sparky_to_pipe_dic = {
            "index": "INDEX",
            "w1": "X_PPM",
            "w2": "Y_PPM",
            "lw1 (hz)": "XW_HZ",
            "lw2 (hz)": "YW_HZ",
            "Height": "HEIGHT",
            "Volume": "VOL",
            "Assignment": "ASS",
        }

        self._analysis_to_pipe_dic[posF1] = "Y_PPM"
        self._analysis_to_pipe_dic[posF2] = "X_PPM"

        self._df = self.read_peaklist()

    def read_peaklist(self):
        match self.fmt:
            case self.fmt.a2:
                self._df = self._read_analysis()

            case self.fmt.a3:
                self._df = self._read_assign()

            case self.fmt.sparky:
                self._df = self._read_sparky()

            case self.fmt.pipe:
                self._df = self._read_pipe()

            case _:
                raise UnknownFormat(f"I don't know this format: {self.fmt}")

        return self._df

    @property
    def df(self):
        return self._df

    @df.setter
    def df(self, df):
        self._df = df
        return self._df

    @property
    def radii(self):
        return self._radii

    @property
    def f2_radius(self):
        """radius for fitting mask in f2"""
        return self.radii[0]

    @property
    def f1_radius(self):
        """radius for fitting mask in f1"""
        return self.radii[1]

    @property
    def analysis_to_pipe_dic(self):
        return self._analysis_to_pipe_dic

    @property
    def assign_to_pipe_dic(self):
        return self._assign_to_pipe_dic

    @property
    def sparky_to_pipe_dic(self):
        return self._sparky_to_pipe_dic

    @property
    def thres(self):
        if self._thres is None:
            self._thres = abs(threshold_otsu(self.data[0]))
            return self._thres
        else:
            return self._thres

    def validate_peaklist(self):
        self.df = pd.DataFrame(
            [
                PeaklistColumns(**i).model_dump()
                for i in self.df.to_dict(orient="records")
            ]
        )
        return self.df

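    # validate_peaklist round-trips every row through the PeaklistColumns
    # pydantic model, so field types are coerced where possible (e.g. a
    # linewidth stored as the string "20.0" becomes the float 20.0) and a
    # pydantic.ValidationError is raised if a required column is missing.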

    def update_df(self):
        # int point value
        self.df["X_AXIS"] = self.df.X_PPM.apply(lambda x: self.uc_f2(x, "ppm"))
        self.df["Y_AXIS"] = self.df.Y_PPM.apply(lambda x: self.uc_f1(x, "ppm"))
        # decimal point value
        self.df["X_AXISf"] = self.df.X_PPM.apply(lambda x: self.uc_f2.f(x, "ppm"))
        self.df["Y_AXISf"] = self.df.Y_PPM.apply(lambda x: self.uc_f1.f(x, "ppm"))
        # in case of missing values (should estimate though)
        self.df["XW_HZ"] = self.df.XW_HZ.replace("None", "20.0")
        self.df["YW_HZ"] = self.df.YW_HZ.replace("None", "20.0")
        self.df["XW_HZ"] = self.df.XW_HZ.replace(np.NaN, "20.0")
        self.df["YW_HZ"] = self.df.YW_HZ.replace(np.NaN, "20.0")
        # convert linewidths to float
        self.df["XW_HZ"] = self.df.XW_HZ.apply(lambda x: float(x))
        self.df["YW_HZ"] = self.df.YW_HZ.apply(lambda x: float(x))
        # convert Hz lw to points
        self.df["XW"] = self.df.XW_HZ.apply(lambda x: x * self.pt_per_hz_f2)
        self.df["YW"] = self.df.YW_HZ.apply(lambda x: x * self.pt_per_hz_f1)
        # makes an assignment column from Assign F1 and Assign F2 columns
        # in analysis2.x and ccpnmr v3 assign peak lists
        if self.fmt in [PeaklistFormat.a2, PeaklistFormat.a3]:
            self.df["ASS"] = self.df.apply(
                # lambda i: "".join([i["Assign F1"], i["Assign F2"]]), axis=1
                lambda i: f"{i['Assign F1']}_{i['Assign F2']}",
                axis=1,
            )

        # make default values for X and Y radii for fit masks
        self.df["X_RADIUS_PPM"] = np.zeros(len(self.df)) + self.f2_radius
        self.df["Y_RADIUS_PPM"] = np.zeros(len(self.df)) + self.f1_radius
        self.df["X_RADIUS"] = self.df.X_RADIUS_PPM.apply(
            lambda x: x * self.pt_per_ppm_f2
        )
        self.df["Y_RADIUS"] = self.df.Y_RADIUS_PPM.apply(
            lambda x: x * self.pt_per_ppm_f1
        )
        # add include column
        if "include" in self.df.columns:
            pass
        else:
            self.df["include"] = self.df.apply(lambda x: "yes", axis=1)

        # check assignments for duplicates
        self.check_assignments()
        # check that peaks are within the bounds of the data
        self.check_peak_bounds()
        self.validate_peaklist()

    def add_fix_bound_columns(self):
        """add columns containing parameter bounds (param_upper/param_lower)
        and whether or not a parameter should be fixed (yes/no)

        For parameter bounding:

        Column names are <param_name>_upper and <param_name>_lower for upper and lower bounds respectively.
        Values are given as floating point. A value of 0.0 indicates that the parameter is unbounded.
        X/Y positions are given in ppm.
        Linewidths are given in Hz.

        For parameter fixing:

        Column names are <param_name>_fix.
        Values are given as a string 'yes' or 'no'.

        """
        pass

    def _read_analysis(self):
        df = pd.read_csv(self.peaklist_path, delimiter="\t")
        new_columns = [self.analysis_to_pipe_dic.get(i, i) for i in df.columns]
        pipe_columns = dict(zip(df.columns, new_columns))
        df = df.rename(index=str, columns=pipe_columns)

        return df

    def _read_assign(self):
        df = pd.read_csv(self.peaklist_path, delimiter="\t")
        new_columns = [self.assign_to_pipe_dic.get(i, i) for i in df.columns]
        pipe_columns = dict(zip(df.columns, new_columns))
        df = df.rename(index=str, columns=pipe_columns)

        return df

    def _read_sparky(self):
        df = pd.read_csv(
            self.peaklist_path,
            skiprows=1,
            sep=r"\s+",
            names=["ASS", "Y_PPM", "X_PPM"],
            # use only first three columns
            usecols=[i for i in range(3)],
        )
        df["INDEX"] = df.index
        # need to add LW estimate
        df["XW_HZ"] = 20.0
        df["YW_HZ"] = 20.0
        # dummy values
        df["HEIGHT"] = 0.0
        df["VOL"] = 0.0
        return df

    def _read_pipe(self):
        to_skip = 0
        with open(self.peaklist_path) as f:
            lines = f.readlines()
            for line in lines:
                if line.startswith("VARS"):
                    columns = line.strip().split()[1:]
                elif line[:5].strip(" ").isdigit():
                    break
                else:
                    to_skip += 1
        df = pd.read_csv(
            self.peaklist_path, skiprows=to_skip, names=columns, sep=r"\s+"
        )
        return df

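    # _read_pipe skips header lines of an NMRPipe peak table until the first
    # data row (a row whose leading field is an integer index), picking up
    # the column names from the VARS line. Roughly, such a header looks like
    # this (an illustrative sketch, not copied from a real file):
    #
    #   VARS   INDEX X_AXIS Y_AXIS X_PPM Y_PPM XW_HZ YW_HZ HEIGHT VOL ASS
    #   FORMAT %5d %9.3f %9.3f %8.3f %8.3f %8.3f %8.3f %+e %+e %s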

    def check_assignments(self):
        # self.df["ASS"] = self.df.
        self.df["ASS"] = self.df.ASS.astype(object)
        self.df.loc[self.df["ASS"].isnull(), "ASS"] = "None_dummy_0"
        self.df["ASS"] = self.df.ASS.astype(str)
        duplicates_bool = self.df.ASS.duplicated()
        duplicates = self.df.ASS[duplicates_bool]
        if len(duplicates) > 0:
            console.print(
                textwrap.dedent(
                    """
                    #############################################################################
                    You have duplicated assignments in your list...
                    Currently each peak needs a unique assignment. Sorry about that buddy...
                    #############################################################################
                    """
                ),
                style="yellow",
            )
            self.df.loc[duplicates_bool, "ASS"] = [
                f"{i}_dummy_{num+1}" for num, i in enumerate(duplicates)
            ]
            if self.verbose:
                print("Here are the duplicates")
                print(duplicates)
                print(self.df.ASS)

            print(
                textwrap.dedent(
                    """
                    Creating dummy assignments for duplicates

                    """
                )
            )

    def check_peak_bounds(self):
        columns_to_print = ["INDEX", "ASS", "X_AXIS", "Y_AXIS", "X_PPM", "Y_PPM"]
        # check that peaks are within the bounds of spectrum
        within_x = (self.df.X_PPM < self.f2_ppm_max) & (self.df.X_PPM > self.f2_ppm_min)
        within_y = (self.df.Y_PPM < self.f1_ppm_max) & (self.df.Y_PPM > self.f1_ppm_min)
        self.excluded = self.df[~(within_x & within_y)]
        self.df = self.df[within_x & within_y]
        if len(self.excluded) > 0:
            print(
                textwrap.dedent(
                    f"""[red]
                    #################################################################################

                    Excluding the following peaks as they are not within the spectrum which has shape

                    {self.data.shape}
                    [/red]"""
                )
            )
            table_to_print = df_to_rich_table(
                self.excluded,
                title="Excluded",
                columns=columns_to_print,
                styles=["red" for i in columns_to_print],
            )
            print(table_to_print)
            print(
                "[red]#################################################################################[/red]"
            )

    def clusters(
        self,
        thres=None,
        struc_el: StrucEl = StrucEl.disk,
        struc_size=(3,),
        l_struc=None,
    ):
        """Find clusters of peaks

        :param thres: threshold for positive signals above which clusters are selected. If None then threshold_otsu is used
        :type thres: float

        :param struc_el: 'square'|'disk'|'rectangle'
            structuring element for binary_closing of thresholded data; can be square, disk or rectangle
        :type struc_el: str

        :param struc_size: size/dimensions of structuring element;
            for square and disk the first element of the tuple is used (for disk the value corresponds to the radius);
            for rectangle, the tuple corresponds to (width, height).
        :type struc_size: tuple


        """
        peaks = [[y, x] for y, x in zip(self.df.Y_AXIS, self.df.X_AXIS)]

        if thres is None:
            thres = self.thres
            self._thres = abs(threshold_otsu(self.data[0]))
        else:
            self._thres = thres

        # get positive and negative
        thresh_data = np.bitwise_or(
            self.data[0] < (self._thres * -1.0), self.data[0] > self._thres
        )

        match struc_el:
            case struc_el.disk:
                radius = struc_size[0]
                if self.verbose:
                    print(f"using disk with {radius}")
                closed_data = binary_closing(thresh_data, disk(int(radius)))

            case struc_el.square:
                width = struc_size[0]
                if self.verbose:
                    print(f"using square with {width}")
                closed_data = binary_closing(thresh_data, square(int(width)))

            case struc_el.rectangle:
                width, height = struc_size
                if self.verbose:
                    print(f"using rectangle with {width} and {height}")
                closed_data = binary_closing(
                    thresh_data, rectangle(int(width), int(height))
                )

            case _:
                if self.verbose:
                    print("Not using any closing function")
                closed_data = thresh_data

        labeled_array, num_features = ndimage.label(closed_data, l_struc)

        self.df.loc[:, "CLUSTID"] = [labeled_array[i[0], i[1]] for i in peaks]

        #  renumber "0" clusters
        max_clustid = self.df["CLUSTID"].max()
        n_of_zeros = len(self.df[self.df["CLUSTID"] == 0]["CLUSTID"])
        self.df.loc[self.df[self.df["CLUSTID"] == 0].index, "CLUSTID"] = np.arange(
            max_clustid + 1, n_of_zeros + max_clustid + 1, dtype=int
        )

        # count how many peaks per cluster
        for ind, group in self.df.groupby("CLUSTID"):
            self.df.loc[group.index, "MEMCNT"] = len(group)

        self.df.loc[:, "color"] = self.df.apply(
            lambda x: Category20[20][int(x.CLUSTID) % 20] if x.MEMCNT > 1 else "black",
            axis=1,
        )
        return ClustersResult(labeled_array, num_features, closed_data, peaks)

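    # A hedged usage sketch for clusters() (the peak list and spectrum paths
    # below are hypothetical):
    #
    #   peaklist = Peaklist("peaks.tsv", "test.ft2", fmt=PeaklistFormat.a2)
    #   peaklist.update_df()  # adds the X_AXIS/Y_AXIS columns clusters() needs
    #   result = peaklist.clusters(struc_el=StrucEl.disk, struc_size=(3,))
    #   print(result.num_features, peaklist.df.CLUSTID.nunique())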

    def mask_method(self, overlap=1.0, l_struc=None):
        """connect clusters based on overlap of fitting masks

        :param overlap: fraction of mask for which overlaps are calculated
        :type overlap: float

        :returns ClustersResult: Instance of ClustersResult
        :rtype: ClustersResult
        """
        # overlap is positive
        overlap = abs(overlap)

        self._thres = threshold_otsu(self.data[0])

        mask = np.zeros(self.data[0].shape, dtype=bool)

        for ind, peak in self.df.iterrows():
            mask += make_mask(
                self.data[0],
                peak.X_AXISf,
                peak.Y_AXISf,
                peak.X_RADIUS * overlap,
                peak.Y_RADIUS * overlap,
            )

        peaks = [[y, x] for y, x in zip(self.df.Y_AXIS, self.df.X_AXIS)]
        labeled_array, num_features = ndimage.label(mask, l_struc)

        self.df.loc[:, "CLUSTID"] = [labeled_array[i[0], i[1]] for i in peaks]

        #  renumber "0" clusters
        max_clustid = self.df["CLUSTID"].max()
        n_of_zeros = len(self.df[self.df["CLUSTID"] == 0]["CLUSTID"])
        self.df.loc[self.df[self.df["CLUSTID"] == 0].index, "CLUSTID"] = np.arange(
            max_clustid + 1, n_of_zeros + max_clustid + 1, dtype=int
        )

        # count how many peaks per cluster
        for ind, group in self.df.groupby("CLUSTID"):
            self.df.loc[group.index, "MEMCNT"] = len(group)

        self.df.loc[:, "color"] = self.df.apply(
            lambda x: Category20[20][int(x.CLUSTID) % 20] if x.MEMCNT > 1 else "black",
            axis=1,
        )

        return ClustersResult(labeled_array, num_features, mask, peaks)

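    # mask_method is an alternative to clusters(): rather than thresholding
    # the data and closing it with a structuring element, it unions the
    # elliptical fitting masks (scaled by `overlap`) and labels the connected
    # regions, e.g. (continuing the hypothetical sketch above):
    #
    #   result = peaklist.mask_method(overlap=1.0)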

    def to_fuda(self):
        fname = self.peaklist_path.parent / "params.fuda"
        with open(self.peaklist_path.parent / "peaks.fuda", "w") as peaks_fuda:
            for ass, f1_ppm, f2_ppm in zip(self.df.ASS, self.df.Y_PPM, self.df.X_PPM):
                peaks_fuda.write(f"{ass}\t{f1_ppm:.3f}\t{f2_ppm:.3f}\n")
        groups = self.df.groupby("CLUSTID")
        fuda_params = Path(fname)
        overlap_peaks = ""

        for ind, group in groups:
            if len(group) > 1:
                overlap_peaks_str = ";".join(group.ASS)
                overlap_peaks += f"OVERLAP_PEAKS=({overlap_peaks_str})\n"

        fuda_file = textwrap.dedent(
            f"""\

# Read peaklist and spectrum info
PEAKLIST=peaks.fuda
SPECFILE={self.data_path}
PARAMETERFILE=(bruker;vclist)
ZCORR=ncyc
NOISE={self.thres} # you'll need to adjust this
BASELINE=N
VERBOSELEVEL=5
PRINTDATA=Y
LM=(MAXFEV=250;TOL=1e-5)
#Specify the default values. All values are in ppm:
DEF_LINEWIDTH_F1={self.f1_radius}
DEF_LINEWIDTH_F2={self.f2_radius}
DEF_RADIUS_F1={self.f1_radius}
DEF_RADIUS_F2={self.f2_radius}
SHAPE=GLORE
# OVERLAP PEAKS
{overlap_peaks}"""
        )
        with open(fuda_params, "w") as f:
            print(f"Writing FuDA file {fuda_params}")
            f.write(fuda_file)
        if self.verbose:
            print(overlap_peaks)


class ClustersResult:
    """Class to store results of clusters function"""

    def __init__(self, labeled_array, num_features, closed_data, peaks):
        self._labeled_array = labeled_array
        self._num_features = num_features
        self._closed_data = closed_data
        self._peaks = peaks

    @property
    def labeled_array(self):
        return self._labeled_array

    @property
    def num_features(self):
        return self._num_features

    @property
    def closed_data(self):
        return self._closed_data

    @property
    def peaks(self):
        return self._peaks


class LoadData(Peaklist):
    """Load peaklist data from the peakipy .csv file output by either peakipy read or peakipy edit

    read_peaklist is redefined to just read a .csv file

    check_data_frame makes sure the data frame is in good shape for setting up fits

    """

    def read_peaklist(self):
        if self.peaklist_path.suffix == ".csv":
            self.df = pd.read_csv(self.peaklist_path)  # , comment="#")

        elif self.peaklist_path.suffix == ".tab":
            self.df = pd.read_csv(self.peaklist_path, sep="\t")  # comment="#")

        else:
            self.df = pd.read_pickle(self.peaklist_path)

        self._thres = threshold_otsu(self.data[0])

        return self.df

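    # LoadData.read_peaklist dispatches on the file suffix, so a previously
    # exported peaklist can be reloaded directly, e.g. (hypothetical paths;
    # note the peaklist path must be a pathlib.Path since .suffix is used):
    #
    #   loaded = LoadData(Path("peaks.csv"), "test.ft2",
    #                     fmt=PeaklistFormat.peakipy)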

    def validate_peaklist(self):
        self.df = pd.DataFrame(
            [
                PeaklistColumnsWithClusters(**i).model_dump()
                for i in self.df.to_dict(orient="records")
            ]
        )
        return self.df

    def check_data_frame(self):
        # make diameter columns
        if "X_DIAMETER_PPM" in self.df.columns:
            pass
        else:
            self.df["X_DIAMETER_PPM"] = self.df["X_RADIUS_PPM"] * 2.0
            self.df["Y_DIAMETER_PPM"] = self.df["Y_RADIUS_PPM"] * 2.0

        #  make a column to track edited peaks
        if "Edited" in self.df.columns:
            pass
        else:
            self.df["Edited"] = np.zeros(len(self.df), dtype=bool)

        # create include column if it doesn't exist
        if "include" in self.df.columns:
            pass
        else:
            self.df["include"] = self.df.apply(lambda _: "yes", axis=1)

        # color clusters
        self.df["color"] = self.df.apply(
            lambda x: Category20[20][int(x.CLUSTID) % 20] if x.MEMCNT > 1 else "black",
            axis=1,
        )

        # get rid of unnamed columns
        unnamed_cols = [i for i in self.df.columns if "Unnamed:" in i]
        self.df = self.df.drop(columns=unnamed_cols)

    def update_df(self):
        """Slightly modified to retain previous configurations"""
        # int point value
        self.df["X_AXIS"] = self.df.X_PPM.apply(lambda x: self.uc_f2(x, "ppm"))
        self.df["Y_AXIS"] = self.df.Y_PPM.apply(lambda x: self.uc_f1(x, "ppm"))
        # decimal point value
        self.df["X_AXISf"] = self.df.X_PPM.apply(lambda x: self.uc_f2.f(x, "ppm"))
        self.df["Y_AXISf"] = self.df.Y_PPM.apply(lambda x: self.uc_f1.f(x, "ppm"))
        # in case of missing values (should estimate though)
        self.df["XW_HZ"] = self.df.XW_HZ.replace(np.NaN, "20.0")
        self.df["YW_HZ"] = self.df.YW_HZ.replace(np.NaN, "20.0")
        # convert linewidths to float
        self.df["XW_HZ"] = self.df.XW_HZ.apply(lambda x: float(x))
        self.df["YW_HZ"] = self.df.YW_HZ.apply(lambda x: float(x))
        # convert Hz lw to points
        self.df["XW"] = self.df.XW_HZ.apply(lambda x: x * self.pt_per_hz_f2)
        self.df["YW"] = self.df.YW_HZ.apply(lambda x: x * self.pt_per_hz_f1)
        # makes an assignment column
        if self.fmt == "a2":
            self.df["ASS"] = self.df.apply(
                lambda i: "".join([i["Assign F1"], i["Assign F2"]]), axis=1
            )

        # make default values for X and Y radii for fit masks
        # self.df["X_RADIUS_PPM"] = np.zeros(len(self.df)) + self.f2_radius
        # self.df["Y_RADIUS_PPM"] = np.zeros(len(self.df)) + self.f1_radius
        self.df["X_RADIUS"] = self.df.X_RADIUS_PPM.apply(
            lambda x: x * self.pt_per_ppm_f2
        )
        self.df["Y_RADIUS"] = self.df.Y_RADIUS_PPM.apply(
            lambda x: x * self.pt_per_ppm_f1
        )
        # add include column
        if "include" in self.df.columns:
            pass
        else:
            self.df["include"] = self.df.apply(lambda x: "yes", axis=1)

        # check assignments for duplicates
        self.check_assignments()
        # check that peaks are within the bounds of the data
        self.check_peak_bounds()
        self.validate_peaklist()


def get_vclist(vclist, args):
    # read vclist
    if vclist is None:
        vclist = False
    elif vclist.exists():
        vclist_data = np.genfromtxt(vclist)
        args["vclist_data"] = vclist_data
        vclist = True
    else:
        raise Exception("vclist not found...")

    args["vclist"] = vclist
    return args
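
# A minimal sketch of how get_vclist might be called (the "vclist" filename
# and the shape of the args dict are assumptions for illustration; vclist
# must be a pathlib.Path or None since .exists() is called on it):
#
#   args = {}
#   args = get_vclist(Path("vclist"), args)
#   if args["vclist"]:
#       print(args["vclist_data"])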