/* dgbtrf.c */
  /* dgbtrf.f -- translated by f2c (version 20061008).
     You must link the resulting object file with libf2c:
          on Microsoft Windows system, link with libf2c.lib;
          on Linux or Unix systems, link with .../path/to/libf2c.a -lm
          or, if you install libf2c.a in a standard place, with -lf2c -lm
          -- in that order, at the end of the command line, as in
                  cc *.o -lf2c -lm
          Source for libf2c is in /netlib/f2c/libf2c.zip, e.g.,

                  http://www.netlib.org/f2c/libf2c.zip
  */
  11. #include "f2c.h"
  12. #include "blaswrap.h"
  13. /* Table of constant values */
  14. static integer c__1 = 1;
  15. static integer c__65 = 65;
  16. static doublereal c_b18 = -1.;
  17. static doublereal c_b31 = 1.;
  18. /* Subroutine */ int _starpu_dgbtrf_(integer *m, integer *n, integer *kl, integer *ku,
  19. doublereal *ab, integer *ldab, integer *ipiv, integer *info)
  20. {
  21. /* System generated locals */
  22. integer ab_dim1, ab_offset, i__1, i__2, i__3, i__4, i__5, i__6;
  23. doublereal d__1;
  24. /* Local variables */
  25. integer i__, j, i2, i3, j2, j3, k2, jb, nb, ii, jj, jm, ip, jp, km, ju,
  26. kv, nw;
  27. extern /* Subroutine */ int _starpu_dger_(integer *, integer *, doublereal *,
  28. doublereal *, integer *, doublereal *, integer *, doublereal *,
  29. integer *);
  30. doublereal temp;
  31. extern /* Subroutine */ int _starpu_dscal_(integer *, doublereal *, doublereal *,
  32. integer *), _starpu_dgemm_(char *, char *, integer *, integer *, integer *
  33. , doublereal *, doublereal *, integer *, doublereal *, integer *,
  34. doublereal *, doublereal *, integer *), _starpu_dcopy_(
  35. integer *, doublereal *, integer *, doublereal *, integer *),
  36. _starpu_dswap_(integer *, doublereal *, integer *, doublereal *, integer *
  37. );
  38. doublereal work13[4160] /* was [65][64] */, work31[4160] /*
  39. was [65][64] */;
  40. extern /* Subroutine */ int _starpu_dtrsm_(char *, char *, char *, char *,
  41. integer *, integer *, doublereal *, doublereal *, integer *,
  42. doublereal *, integer *), _starpu_dgbtf2_(
  43. integer *, integer *, integer *, integer *, doublereal *, integer
  44. *, integer *, integer *);
  45. extern integer _starpu_idamax_(integer *, doublereal *, integer *);
  46. extern /* Subroutine */ int _starpu_xerbla_(char *, integer *);
  47. extern integer _starpu_ilaenv_(integer *, char *, char *, integer *, integer *,
  48. integer *, integer *);
  49. extern /* Subroutine */ int _starpu_dlaswp_(integer *, doublereal *, integer *,
  50. integer *, integer *, integer *, integer *);
  51. /* -- LAPACK routine (version 3.2) -- */
  52. /* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
  53. /* November 2006 */
  54. /* .. Scalar Arguments .. */
  55. /* .. */
  56. /* .. Array Arguments .. */
  57. /* .. */
  58. /* Purpose */
  59. /* ======= */
  60. /* DGBTRF computes an LU factorization of a real m-by-n band matrix A */
  61. /* using partial pivoting with row interchanges. */
  62. /* This is the blocked version of the algorithm, calling Level 3 BLAS. */
  63. /* Arguments */
  64. /* ========= */
  65. /* M (input) INTEGER */
  66. /* The number of rows of the matrix A. M >= 0. */
  67. /* N (input) INTEGER */
  68. /* The number of columns of the matrix A. N >= 0. */
  69. /* KL (input) INTEGER */
  70. /* The number of subdiagonals within the band of A. KL >= 0. */
  71. /* KU (input) INTEGER */
  72. /* The number of superdiagonals within the band of A. KU >= 0. */
  73. /* AB (input/output) DOUBLE PRECISION array, dimension (LDAB,N) */
  74. /* On entry, the matrix A in band storage, in rows KL+1 to */
  75. /* 2*KL+KU+1; rows 1 to KL of the array need not be set. */
  76. /* The j-th column of A is stored in the j-th column of the */
  77. /* array AB as follows: */
  78. /* AB(kl+ku+1+i-j,j) = A(i,j) for max(1,j-ku)<=i<=min(m,j+kl) */
  79. /* On exit, details of the factorization: U is stored as an */
  80. /* upper triangular band matrix with KL+KU superdiagonals in */
  81. /* rows 1 to KL+KU+1, and the multipliers used during the */
  82. /* factorization are stored in rows KL+KU+2 to 2*KL+KU+1. */
  83. /* See below for further details. */
  84. /* LDAB (input) INTEGER */
  85. /* The leading dimension of the array AB. LDAB >= 2*KL+KU+1. */
  86. /* IPIV (output) INTEGER array, dimension (min(M,N)) */
  87. /* The pivot indices; for 1 <= i <= min(M,N), row i of the */
  88. /* matrix was interchanged with row IPIV(i). */
  89. /* INFO (output) INTEGER */
  90. /* = 0: successful exit */
  91. /* < 0: if INFO = -i, the i-th argument had an illegal value */
  92. /* > 0: if INFO = +i, U(i,i) is exactly zero. The factorization */
  93. /* has been completed, but the factor U is exactly */
  94. /* singular, and division by zero will occur if it is used */
  95. /* to solve a system of equations. */
  96. /* Further Details */
  97. /* =============== */
  98. /* The band storage scheme is illustrated by the following example, when */
  99. /* M = N = 6, KL = 2, KU = 1: */
  100. /* On entry: On exit: */
  101. /* * * * + + + * * * u14 u25 u36 */
  102. /* * * + + + + * * u13 u24 u35 u46 */
  103. /* * a12 a23 a34 a45 a56 * u12 u23 u34 u45 u56 */
  104. /* a11 a22 a33 a44 a55 a66 u11 u22 u33 u44 u55 u66 */
  105. /* a21 a32 a43 a54 a65 * m21 m32 m43 m54 m65 * */
  106. /* a31 a42 a53 a64 * * m31 m42 m53 m64 * * */
  107. /* Array elements marked * are not used by the routine; elements marked */
  108. /* + need not be set on entry, but are required by the routine to store */
  109. /* elements of U because of fill-in resulting from the row interchanges. */
  110. /* ===================================================================== */
  111. /* .. Parameters .. */
  112. /* .. */
  113. /* .. Local Scalars .. */
  114. /* .. */
  115. /* .. Local Arrays .. */
  116. /* .. */
  117. /* .. External Functions .. */
  118. /* .. */
  119. /* .. External Subroutines .. */
  120. /* .. */
  121. /* .. Intrinsic Functions .. */
  122. /* .. */
  123. /* .. Executable Statements .. */
  124. /* KV is the number of superdiagonals in the factor U, allowing for */
  125. /* fill-in */
  126. /* Parameter adjustments */
  127. ab_dim1 = *ldab;
  128. ab_offset = 1 + ab_dim1;
  129. ab -= ab_offset;
  130. --ipiv;
  131. /* Function Body */
  132. kv = *ku + *kl;
  133. /* Test the input parameters. */
  134. *info = 0;
  135. if (*m < 0) {
  136. *info = -1;
  137. } else if (*n < 0) {
  138. *info = -2;
  139. } else if (*kl < 0) {
  140. *info = -3;
  141. } else if (*ku < 0) {
  142. *info = -4;
  143. } else if (*ldab < *kl + kv + 1) {
  144. *info = -6;
  145. }
  146. if (*info != 0) {
  147. i__1 = -(*info);
  148. _starpu_xerbla_("DGBTRF", &i__1);
  149. return 0;
  150. }
  151. /* Quick return if possible */
  152. if (*m == 0 || *n == 0) {
  153. return 0;
  154. }
  155. /* Determine the block size for this environment */
  156. nb = _starpu_ilaenv_(&c__1, "DGBTRF", " ", m, n, kl, ku);
  157. /* The block size must not exceed the limit set by the size of the */
  158. /* local arrays WORK13 and WORK31. */
  159. nb = min(nb,64);
  160. if (nb <= 1 || nb > *kl) {
  161. /* Use unblocked code */
  162. _starpu_dgbtf2_(m, n, kl, ku, &ab[ab_offset], ldab, &ipiv[1], info);
  163. } else {
  164. /* Use blocked code */
  165. /* Zero the superdiagonal elements of the work array WORK13 */
  166. i__1 = nb;
  167. for (j = 1; j <= i__1; ++j) {
  168. i__2 = j - 1;
  169. for (i__ = 1; i__ <= i__2; ++i__) {
  170. work13[i__ + j * 65 - 66] = 0.;
  171. /* L10: */
  172. }
  173. /* L20: */
  174. }
  175. /* Zero the subdiagonal elements of the work array WORK31 */
  176. i__1 = nb;
  177. for (j = 1; j <= i__1; ++j) {
  178. i__2 = nb;
  179. for (i__ = j + 1; i__ <= i__2; ++i__) {
  180. work31[i__ + j * 65 - 66] = 0.;
  181. /* L30: */
  182. }
  183. /* L40: */
  184. }
  185. /* Gaussian elimination with partial pivoting */
  186. /* Set fill-in elements in columns KU+2 to KV to zero */
  187. i__1 = min(kv,*n);
  188. for (j = *ku + 2; j <= i__1; ++j) {
  189. i__2 = *kl;
  190. for (i__ = kv - j + 2; i__ <= i__2; ++i__) {
  191. ab[i__ + j * ab_dim1] = 0.;
  192. /* L50: */
  193. }
  194. /* L60: */
  195. }
  196. /* JU is the index of the last column affected by the current */
  197. /* stage of the factorization */
  198. ju = 1;
  199. i__1 = min(*m,*n);
  200. i__2 = nb;
  201. for (j = 1; i__2 < 0 ? j >= i__1 : j <= i__1; j += i__2) {
  202. /* Computing MIN */
  203. i__3 = nb, i__4 = min(*m,*n) - j + 1;
  204. jb = min(i__3,i__4);
  205. /* The active part of the matrix is partitioned */
  206. /* A11 A12 A13 */
  207. /* A21 A22 A23 */
  208. /* A31 A32 A33 */
  209. /* Here A11, A21 and A31 denote the current block of JB columns */
  210. /* which is about to be factorized. The number of rows in the */
  211. /* partitioning are JB, I2, I3 respectively, and the numbers */
  212. /* of columns are JB, J2, J3. The superdiagonal elements of A13 */
  213. /* and the subdiagonal elements of A31 lie outside the band. */
  214. /* Computing MIN */
  215. i__3 = *kl - jb, i__4 = *m - j - jb + 1;
  216. i2 = min(i__3,i__4);
  217. /* Computing MIN */
  218. i__3 = jb, i__4 = *m - j - *kl + 1;
  219. i3 = min(i__3,i__4);
  220. /* J2 and J3 are computed after JU has been updated. */
  221. /* Factorize the current block of JB columns */
  222. i__3 = j + jb - 1;
  223. for (jj = j; jj <= i__3; ++jj) {
  224. /* Set fill-in elements in column JJ+KV to zero */
  225. if (jj + kv <= *n) {
  226. i__4 = *kl;
  227. for (i__ = 1; i__ <= i__4; ++i__) {
  228. ab[i__ + (jj + kv) * ab_dim1] = 0.;
  229. /* L70: */
  230. }
  231. }
  232. /* Find pivot and test for singularity. KM is the number of */
  233. /* subdiagonal elements in the current column. */
  234. /* Computing MIN */
  235. i__4 = *kl, i__5 = *m - jj;
  236. km = min(i__4,i__5);
  237. i__4 = km + 1;
  238. jp = _starpu_idamax_(&i__4, &ab[kv + 1 + jj * ab_dim1], &c__1);
  239. ipiv[jj] = jp + jj - j;
  240. if (ab[kv + jp + jj * ab_dim1] != 0.) {
  241. /* Computing MAX */
  242. /* Computing MIN */
  243. i__6 = jj + *ku + jp - 1;
  244. i__4 = ju, i__5 = min(i__6,*n);
  245. ju = max(i__4,i__5);
  246. if (jp != 1) {
  247. /* Apply interchange to columns J to J+JB-1 */
  248. if (jp + jj - 1 < j + *kl) {
  249. i__4 = *ldab - 1;
  250. i__5 = *ldab - 1;
  251. _starpu_dswap_(&jb, &ab[kv + 1 + jj - j + j * ab_dim1], &
  252. i__4, &ab[kv + jp + jj - j + j * ab_dim1],
  253. &i__5);
  254. } else {
  255. /* The interchange affects columns J to JJ-1 of A31 */
  256. /* which are stored in the work array WORK31 */
  257. i__4 = jj - j;
  258. i__5 = *ldab - 1;
  259. _starpu_dswap_(&i__4, &ab[kv + 1 + jj - j + j * ab_dim1],
  260. &i__5, &work31[jp + jj - j - *kl - 1], &
  261. c__65);
  262. i__4 = j + jb - jj;
  263. i__5 = *ldab - 1;
  264. i__6 = *ldab - 1;
  265. _starpu_dswap_(&i__4, &ab[kv + 1 + jj * ab_dim1], &i__5, &
  266. ab[kv + jp + jj * ab_dim1], &i__6);
  267. }
  268. }
  269. /* Compute multipliers */
  270. d__1 = 1. / ab[kv + 1 + jj * ab_dim1];
  271. _starpu_dscal_(&km, &d__1, &ab[kv + 2 + jj * ab_dim1], &c__1);
  272. /* Update trailing submatrix within the band and within */
  273. /* the current block. JM is the index of the last column */
  274. /* which needs to be updated. */
  275. /* Computing MIN */
  276. i__4 = ju, i__5 = j + jb - 1;
  277. jm = min(i__4,i__5);
  278. if (jm > jj) {
  279. i__4 = jm - jj;
  280. i__5 = *ldab - 1;
  281. i__6 = *ldab - 1;
  282. _starpu_dger_(&km, &i__4, &c_b18, &ab[kv + 2 + jj * ab_dim1],
  283. &c__1, &ab[kv + (jj + 1) * ab_dim1], &i__5, &
  284. ab[kv + 1 + (jj + 1) * ab_dim1], &i__6);
  285. }
  286. } else {
  287. /* If pivot is zero, set INFO to the index of the pivot */
  288. /* unless a zero pivot has already been found. */
  289. if (*info == 0) {
  290. *info = jj;
  291. }
  292. }
  293. /* Copy current column of A31 into the work array WORK31 */
  294. /* Computing MIN */
  295. i__4 = jj - j + 1;
  296. nw = min(i__4,i3);
  297. if (nw > 0) {
  298. _starpu_dcopy_(&nw, &ab[kv + *kl + 1 - jj + j + jj * ab_dim1], &
  299. c__1, &work31[(jj - j + 1) * 65 - 65], &c__1);
  300. }
  301. /* L80: */
  302. }
  303. if (j + jb <= *n) {
  304. /* Apply the row interchanges to the other blocks. */
  305. /* Computing MIN */
  306. i__3 = ju - j + 1;
  307. j2 = min(i__3,kv) - jb;
  308. /* Computing MAX */
  309. i__3 = 0, i__4 = ju - j - kv + 1;
  310. j3 = max(i__3,i__4);
  311. /* Use DLASWP to apply the row interchanges to A12, A22, and */
  312. /* A32. */
  313. i__3 = *ldab - 1;
  314. _starpu_dlaswp_(&j2, &ab[kv + 1 - jb + (j + jb) * ab_dim1], &i__3, &
  315. c__1, &jb, &ipiv[j], &c__1);
  316. /* Adjust the pivot indices. */
  317. i__3 = j + jb - 1;
  318. for (i__ = j; i__ <= i__3; ++i__) {
  319. ipiv[i__] = ipiv[i__] + j - 1;
  320. /* L90: */
  321. }
  322. /* Apply the row interchanges to A13, A23, and A33 */
  323. /* columnwise. */
  324. k2 = j - 1 + jb + j2;
  325. i__3 = j3;
  326. for (i__ = 1; i__ <= i__3; ++i__) {
  327. jj = k2 + i__;
  328. i__4 = j + jb - 1;
  329. for (ii = j + i__ - 1; ii <= i__4; ++ii) {
  330. ip = ipiv[ii];
  331. if (ip != ii) {
  332. temp = ab[kv + 1 + ii - jj + jj * ab_dim1];
  333. ab[kv + 1 + ii - jj + jj * ab_dim1] = ab[kv + 1 +
  334. ip - jj + jj * ab_dim1];
  335. ab[kv + 1 + ip - jj + jj * ab_dim1] = temp;
  336. }
  337. /* L100: */
  338. }
  339. /* L110: */
  340. }
  341. /* Update the relevant part of the trailing submatrix */
  342. if (j2 > 0) {
  343. /* Update A12 */
  344. i__3 = *ldab - 1;
  345. i__4 = *ldab - 1;
  346. _starpu_dtrsm_("Left", "Lower", "No transpose", "Unit", &jb, &j2,
  347. &c_b31, &ab[kv + 1 + j * ab_dim1], &i__3, &ab[kv
  348. + 1 - jb + (j + jb) * ab_dim1], &i__4);
  349. if (i2 > 0) {
  350. /* Update A22 */
  351. i__3 = *ldab - 1;
  352. i__4 = *ldab - 1;
  353. i__5 = *ldab - 1;
  354. _starpu_dgemm_("No transpose", "No transpose", &i2, &j2, &jb,
  355. &c_b18, &ab[kv + 1 + jb + j * ab_dim1], &i__3,
  356. &ab[kv + 1 - jb + (j + jb) * ab_dim1], &i__4,
  357. &c_b31, &ab[kv + 1 + (j + jb) * ab_dim1], &
  358. i__5);
  359. }
  360. if (i3 > 0) {
  361. /* Update A32 */
  362. i__3 = *ldab - 1;
  363. i__4 = *ldab - 1;
  364. _starpu_dgemm_("No transpose", "No transpose", &i3, &j2, &jb,
  365. &c_b18, work31, &c__65, &ab[kv + 1 - jb + (j
  366. + jb) * ab_dim1], &i__3, &c_b31, &ab[kv + *kl
  367. + 1 - jb + (j + jb) * ab_dim1], &i__4);
  368. }
  369. }
  370. if (j3 > 0) {
  371. /* Copy the lower triangle of A13 into the work array */
  372. /* WORK13 */
  373. i__3 = j3;
  374. for (jj = 1; jj <= i__3; ++jj) {
  375. i__4 = jb;
  376. for (ii = jj; ii <= i__4; ++ii) {
  377. work13[ii + jj * 65 - 66] = ab[ii - jj + 1 + (jj
  378. + j + kv - 1) * ab_dim1];
  379. /* L120: */
  380. }
  381. /* L130: */
  382. }
  383. /* Update A13 in the work array */
  384. i__3 = *ldab - 1;
  385. _starpu_dtrsm_("Left", "Lower", "No transpose", "Unit", &jb, &j3,
  386. &c_b31, &ab[kv + 1 + j * ab_dim1], &i__3, work13,
  387. &c__65);
  388. if (i2 > 0) {
  389. /* Update A23 */
  390. i__3 = *ldab - 1;
  391. i__4 = *ldab - 1;
  392. _starpu_dgemm_("No transpose", "No transpose", &i2, &j3, &jb,
  393. &c_b18, &ab[kv + 1 + jb + j * ab_dim1], &i__3,
  394. work13, &c__65, &c_b31, &ab[jb + 1 + (j + kv)
  395. * ab_dim1], &i__4);
  396. }
  397. if (i3 > 0) {
  398. /* Update A33 */
  399. i__3 = *ldab - 1;
  400. _starpu_dgemm_("No transpose", "No transpose", &i3, &j3, &jb,
  401. &c_b18, work31, &c__65, work13, &c__65, &
  402. c_b31, &ab[*kl + 1 + (j + kv) * ab_dim1], &
  403. i__3);
  404. }
  405. /* Copy the lower triangle of A13 back into place */
  406. i__3 = j3;
  407. for (jj = 1; jj <= i__3; ++jj) {
  408. i__4 = jb;
  409. for (ii = jj; ii <= i__4; ++ii) {
  410. ab[ii - jj + 1 + (jj + j + kv - 1) * ab_dim1] =
  411. work13[ii + jj * 65 - 66];
  412. /* L140: */
  413. }
  414. /* L150: */
  415. }
  416. }
  417. } else {
  418. /* Adjust the pivot indices. */
  419. i__3 = j + jb - 1;
  420. for (i__ = j; i__ <= i__3; ++i__) {
  421. ipiv[i__] = ipiv[i__] + j - 1;
  422. /* L160: */
  423. }
  424. }
  425. /* Partially undo the interchanges in the current block to */
  426. /* restore the upper triangular form of A31 and copy the upper */
  427. /* triangle of A31 back into place */
  428. i__3 = j;
  429. for (jj = j + jb - 1; jj >= i__3; --jj) {
  430. jp = ipiv[jj] - jj + 1;
  431. if (jp != 1) {
  432. /* Apply interchange to columns J to JJ-1 */
  433. if (jp + jj - 1 < j + *kl) {
  434. /* The interchange does not affect A31 */
  435. i__4 = jj - j;
  436. i__5 = *ldab - 1;
  437. i__6 = *ldab - 1;
  438. _starpu_dswap_(&i__4, &ab[kv + 1 + jj - j + j * ab_dim1], &
  439. i__5, &ab[kv + jp + jj - j + j * ab_dim1], &
  440. i__6);
  441. } else {
  442. /* The interchange does affect A31 */
  443. i__4 = jj - j;
  444. i__5 = *ldab - 1;
  445. _starpu_dswap_(&i__4, &ab[kv + 1 + jj - j + j * ab_dim1], &
  446. i__5, &work31[jp + jj - j - *kl - 1], &c__65);
  447. }
  448. }
  449. /* Copy the current column of A31 back into place */
  450. /* Computing MIN */
  451. i__4 = i3, i__5 = jj - j + 1;
  452. nw = min(i__4,i__5);
  453. if (nw > 0) {
  454. _starpu_dcopy_(&nw, &work31[(jj - j + 1) * 65 - 65], &c__1, &ab[
  455. kv + *kl + 1 - jj + j + jj * ab_dim1], &c__1);
  456. }
  457. /* L170: */
  458. }
  459. /* L180: */
  460. }
  461. }
  462. return 0;
  463. /* End of DGBTRF */
  464. } /* _starpu_dgbtrf_ */