PostgreSQL Source Code git master
Loading...
Searching...
No Matches
copyfromparse.c
Go to the documentation of this file.
/*-------------------------------------------------------------------------
 *
 * copyfromparse.c
 *		Parse CSV/text/binary format for COPY FROM.
 *
 * This file contains routines to parse the text, CSV and binary input
 * formats.  The main entry point is NextCopyFrom(), which parses the
 * next input line and returns it as Datums.
 *
 * In text/CSV mode, the parsing happens in multiple stages:
 *
 * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
 *                1.          2.            3.           4.
 *
 * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
 *    places it into 'raw_buf'.
 *
 * 2. CopyConvertBuf() calls the encoding conversion function to convert
 *    the data in 'raw_buf' from client to server encoding, placing the
 *    converted result in 'input_buf'.
 *
 * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
 *    It is responsible for finding the next newline marker, taking quote and
 *    escape characters into account according to the COPY options.  The line
 *    is copied into 'line_buf', with quotes and escape characters still
 *    intact.
 *
 * 4. CopyReadAttributesText/CSV() function takes the input line from
 *    'line_buf', and splits it into fields, unescaping the data as required.
 *    The fields are stored in 'attribute_buf', and 'raw_fields' array holds
 *    pointers to each field.
 *
 * If encoding conversion is not required, a shortcut is taken in step 2 to
 * avoid copying the data unnecessarily.  The 'input_buf' pointer is set to
 * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
 * directly into 'input_buf'.  CopyConvertBuf() then merely validates that
 * the data is valid in the current encoding.
 *
 * In binary mode, the pipeline is much simpler.  Input is loaded into
 * 'raw_buf', and encoding conversion is done in the datatype-specific
 * receive functions, if required.  'input_buf' and 'line_buf' are not used,
 * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
 * data when it's passed to the receive function.
 *
 * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE).  'input_buf' is also
 * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required.  'line_buf'
 * and 'attribute_buf' are expanded on demand, to hold the longest line
 * encountered so far.
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/commands/copyfromparse.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <ctype.h>
#include <unistd.h>
#include <sys/stat.h>

#include "commands/copyapi.h"
#include "commands/copyfrom_internal.h"
#include "commands/progress.h"
#include "executor/executor.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_bswap.h"
#include "utils/builtins.h"
#include "utils/rel.h"
77
/* True if 'c' is one of the octal digit characters '0'..'7'. */
#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
/* Numeric value of the digit character 'c' (caller checks ISOCTAL first). */
#define OCTVALUE(c) ((c) - '0')
80
/*
 * These macros centralize code used to process line_buf and input_buf buffers.
 * They are macros because they often do continue/break control and to avoid
 * function call overhead in tight COPY loops.
 *
 * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
 * prevent the continue/break processing from working.  We end the "if (1)"
 * with "else ((void) 0)" to ensure the "if" does not unintentionally match
 * any "else" in the calling code, and to avoid any compiler warnings about
 * empty statements.  See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
 */

/*
 * This keeps the character read at the top of the loop in the buffer
 * even if there is more than one read-ahead.
 *
 * Expects these variables in the expanding scope: input_buf_ptr,
 * copy_buf_len, prev_raw_ptr, hit_eof and need_data.  If 'extralen' more
 * bytes are not available and EOF has not been hit, the fetch is undone,
 * need_data is set, and the enclosing loop is continued.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
	{ \
		input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
		need_data = true; \
		continue; \
	} \
} else ((void) 0)
107
/*
 * This consumes the remainder of the buffer and breaks.
 *
 * Expects these variables in the expanding scope: input_buf_ptr,
 * copy_buf_len, hit_eof and result.  If 'extralen' more bytes are not
 * available and EOF has been hit, result is set to true and the enclosing
 * loop is left with "break".
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
	{ \
		if (extralen) \
			input_buf_ptr = copy_buf_len; /* consume the partial character */ \
		/* backslash just before EOF, treat as data char */ \
		result = true; \
		break; \
	} \
} else ((void) 0)
121
/*
 * Transfer any approved data to line_buf; must do this to be sure
 * there is some room in input_buf.
 *
 * Expects 'cstate' and 'input_buf_ptr' in the expanding scope.  Bytes of
 * input_buf between cstate->input_buf_index and input_buf_ptr are appended
 * to cstate->line_buf, and input_buf_index is advanced to input_buf_ptr.
 */
#define REFILL_LINEBUF \
if (1) \
{ \
	if (input_buf_ptr > cstate->input_buf_index) \
	{ \
		appendBinaryStringInfo(&cstate->line_buf, \
							 cstate->input_buf + cstate->input_buf_index, \
							   input_buf_ptr - cstate->input_buf_index); \
		cstate->input_buf_index = input_buf_ptr; \
	} \
} else ((void) 0)
137
138/* NOTE: there's a copy of this in copyto.c */
139static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
140
141
142/* non-export function prototypes */
143static bool CopyReadLine(CopyFromState cstate, bool is_csv);
145 bool is_csv);
146static int CopyReadAttributesText(CopyFromState cstate);
147static int CopyReadAttributesCSV(CopyFromState cstate);
149 Oid typioparam, int32 typmod,
150 bool *isnull);
152 ExprContext *econtext,
153 Datum *values,
154 bool *nulls,
155 bool is_csv);
157 char ***fields,
158 int *nfields,
159 bool is_csv);
160
161
162/* Low-level communications functions */
163static int CopyGetData(CopyFromState cstate, void *databuf,
164 int minread, int maxread);
165static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
166static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
167static void CopyLoadInputBuf(CopyFromState cstate);
168static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
169
170void
172{
174 int natts = list_length(cstate->attnumlist);
175 int16 format = (cstate->opts.binary ? 1 : 0);
176 int i;
177
179 pq_sendbyte(&buf, format); /* overall format */
180 pq_sendint16(&buf, natts);
181 for (i = 0; i < natts; i++)
182 pq_sendint16(&buf, format); /* per-column formats */
184 cstate->copy_src = COPY_FRONTEND;
185 cstate->fe_msgbuf = makeStringInfo();
186 /* We *must* flush here to ensure FE knows it can send. */
187 pq_flush();
188}
189
190void
192{
193 char readSig[11];
194 int32 tmp;
195
196 /* Signature */
197 if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
198 memcmp(readSig, BinarySignature, 11) != 0)
201 errmsg("COPY file signature not recognized")));
202 /* Flags field */
203 if (!CopyGetInt32(cstate, &tmp))
206 errmsg("invalid COPY file header (missing flags)")));
207 if ((tmp & (1 << 16)) != 0)
210 errmsg("invalid COPY file header (WITH OIDS)")));
211 tmp &= ~(1 << 16);
212 if ((tmp >> 16) != 0)
215 errmsg("unrecognized critical flags in COPY file header")));
216 /* Header extension length */
217 if (!CopyGetInt32(cstate, &tmp) ||
218 tmp < 0)
221 errmsg("invalid COPY file header (missing length)")));
222 /* Skip extension header, if present */
223 while (tmp-- > 0)
224 {
225 if (CopyReadBinaryData(cstate, readSig, 1) != 1)
228 errmsg("invalid COPY file header (wrong length)")));
229 }
230}
231
232/*
233 * CopyGetData reads data from the source (file or frontend)
234 *
235 * We attempt to read at least minread, and at most maxread, bytes from
236 * the source. The actual number of bytes read is returned; if this is
237 * less than minread, EOF was detected.
238 *
239 * Note: when copying from the frontend, we expect a proper EOF mark per
240 * protocol; if the frontend simply drops the connection, we raise error.
241 * It seems unwise to allow the COPY IN to complete normally in that case.
242 *
243 * NB: no data conversion is applied here.
244 */
245static int
247{
248 int bytesread = 0;
249
250 switch (cstate->copy_src)
251 {
252 case COPY_FILE:
254 bytesread = fread(databuf, 1, maxread, cstate->copy_file);
256 if (ferror(cstate->copy_file))
259 errmsg("could not read from COPY file: %m")));
260 if (bytesread == 0)
261 cstate->raw_reached_eof = true;
262 break;
263 case COPY_FRONTEND:
264 while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
265 {
266 int avail;
267
268 while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
269 {
270 /* Try to receive another message */
271 int mtype;
272 int maxmsglen;
273
277 mtype = pq_getbyte();
278 if (mtype == EOF)
281 errmsg("unexpected EOF on client connection with an open transaction")));
282 /* Validate message type and set packet size limit */
283 switch (mtype)
284 {
285 case PqMsg_CopyData:
287 break;
288 case PqMsg_CopyDone:
289 case PqMsg_CopyFail:
290 case PqMsg_Flush:
291 case PqMsg_Sync:
293 break;
294 default:
297 errmsg("unexpected message type 0x%02X during COPY from stdin",
298 mtype)));
299 maxmsglen = 0; /* keep compiler quiet */
300 break;
301 }
302 /* Now collect the message body */
303 if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
306 errmsg("unexpected EOF on client connection with an open transaction")));
308 /* ... and process it */
309 switch (mtype)
310 {
311 case PqMsg_CopyData:
312 break;
313 case PqMsg_CopyDone:
314 /* COPY IN correctly terminated by frontend */
315 cstate->raw_reached_eof = true;
316 return bytesread;
317 case PqMsg_CopyFail:
320 errmsg("COPY from stdin failed: %s",
321 pq_getmsgstring(cstate->fe_msgbuf))));
322 break;
323 case PqMsg_Flush:
324 case PqMsg_Sync:
325
326 /*
327 * Ignore Flush/Sync for the convenience of client
328 * libraries (such as libpq) that may send those
329 * without noticing that the command they just
330 * sent was COPY.
331 */
332 goto readmessage;
333 default:
334 Assert(false); /* NOT REACHED */
335 }
336 }
337 avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
338 if (avail > maxread)
339 avail = maxread;
340 pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
341 databuf = (char *) databuf + avail;
342 maxread -= avail;
343 bytesread += avail;
344 }
345 break;
346 case COPY_CALLBACK:
348 break;
349 }
350
351 return bytesread;
352}
353
354
355/*
356 * These functions do apply some data conversion
357 */
358
359/*
360 * CopyGetInt32 reads an int32 that appears in network byte order
361 *
362 * Returns true if OK, false if EOF
363 */
364static inline bool
366{
367 uint32 buf;
368
369 if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
370 {
371 *val = 0; /* suppress compiler warning */
372 return false;
373 }
374 *val = (int32) pg_ntoh32(buf);
375 return true;
376}
377
378/*
379 * CopyGetInt16 reads an int16 that appears in network byte order
380 */
381static inline bool
383{
384 uint16 buf;
385
386 if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
387 {
388 *val = 0; /* suppress compiler warning */
389 return false;
390 }
391 *val = (int16) pg_ntoh16(buf);
392 return true;
393}
394
395
396/*
397 * Perform encoding conversion on data in 'raw_buf', writing the converted
398 * data into 'input_buf'.
399 *
400 * On entry, there must be some data to convert in 'raw_buf'.
401 */
402static void
404{
405 /*
406 * If the file and server encoding are the same, no encoding conversion is
407 * required. However, we still need to verify that the input is valid for
408 * the encoding.
409 */
410 if (!cstate->need_transcoding)
411 {
412 /*
413 * When conversion is not required, input_buf and raw_buf are the
414 * same. raw_buf_len is the total number of bytes in the buffer, and
415 * input_buf_len tracks how many of those bytes have already been
416 * verified.
417 */
418 int preverifiedlen = cstate->input_buf_len;
419 int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
420 int nverified;
421
422 if (unverifiedlen == 0)
423 {
424 /*
425 * If no more raw data is coming, report the EOF to the caller.
426 */
427 if (cstate->raw_reached_eof)
428 cstate->input_reached_eof = true;
429 return;
430 }
431
432 /*
433 * Verify the new data, including any residual unverified bytes from
434 * previous round.
435 */
437 cstate->raw_buf + preverifiedlen,
439 if (nverified == 0)
440 {
441 /*
442 * Could not verify anything.
443 *
444 * If there is no more raw input data coming, it means that there
445 * was an incomplete multi-byte sequence at the end. Also, if
446 * there's "enough" input left, we should be able to verify at
447 * least one character, and a failure to do so means that we've
448 * hit an invalid byte sequence.
449 */
451 cstate->input_reached_error = true;
452 return;
453 }
454 cstate->input_buf_len += nverified;
455 }
456 else
457 {
458 /*
459 * Encoding conversion is needed.
460 */
461 int nbytes;
462 unsigned char *src;
463 int srclen;
464 unsigned char *dst;
465 int dstlen;
466 int convertedlen;
467
468 if (RAW_BUF_BYTES(cstate) == 0)
469 {
470 /*
471 * If no more raw data is coming, report the EOF to the caller.
472 */
473 if (cstate->raw_reached_eof)
474 cstate->input_reached_eof = true;
475 return;
476 }
477
478 /*
479 * First, copy down any unprocessed data.
480 */
481 nbytes = INPUT_BUF_BYTES(cstate);
482 if (nbytes > 0 && cstate->input_buf_index > 0)
483 memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
484 nbytes);
485 cstate->input_buf_index = 0;
486 cstate->input_buf_len = nbytes;
487 cstate->input_buf[nbytes] = '\0';
488
489 src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
490 srclen = cstate->raw_buf_len - cstate->raw_buf_index;
491 dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
492 dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
493
494 /*
495 * Do the conversion. This might stop short, if there is an invalid
496 * byte sequence in the input. We'll convert as much as we can in
497 * that case.
498 *
499 * Note: Even if we hit an invalid byte sequence, we don't report the
500 * error until all the valid bytes have been consumed. The input
501 * might contain an end-of-input marker (\.), and we don't want to
502 * report an error if the invalid byte sequence is after the
503 * end-of-input marker. We might unnecessarily convert some data
504 * after the end-of-input marker as long as it's valid for the
505 * encoding, but that's harmless.
506 */
508 cstate->file_encoding,
510 src, srclen,
511 dst, dstlen,
512 true);
513 if (convertedlen == 0)
514 {
515 /*
516 * Could not convert anything. If there is no more raw input data
517 * coming, it means that there was an incomplete multi-byte
518 * sequence at the end. Also, if there is plenty of input left,
519 * we should be able to convert at least one character, so a
520 * failure to do so must mean that we've hit a byte sequence
521 * that's invalid.
522 */
524 cstate->input_reached_error = true;
525 return;
526 }
527 cstate->raw_buf_index += convertedlen;
528 cstate->input_buf_len += strlen((char *) dst);
529 }
530}
531
532/*
533 * Report an encoding or conversion error.
534 */
535static void
537{
538 Assert(cstate->raw_buf_len > 0);
540
541 if (!cstate->need_transcoding)
542 {
543 /*
544 * Everything up to input_buf_len was successfully verified, and
545 * input_buf_len points to the invalid or incomplete character.
546 */
548 cstate->raw_buf + cstate->input_buf_len,
549 cstate->raw_buf_len - cstate->input_buf_len);
550 }
551 else
552 {
553 /*
554 * raw_buf_index points to the invalid or untranslatable character. We
555 * let the conversion routine report the error, because it can provide
556 * a more specific error message than we could here. An earlier call
557 * to the conversion routine in CopyConvertBuf() detected that there
558 * is an error, now we call the conversion routine again with
559 * noError=false, to have it throw the error.
560 */
561 unsigned char *src;
562 int srclen;
563 unsigned char *dst;
564 int dstlen;
565
566 src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
567 srclen = cstate->raw_buf_len - cstate->raw_buf_index;
568 dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
569 dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
570
572 cstate->file_encoding,
574 src, srclen,
575 dst, dstlen,
576 false);
577
578 /*
579 * The conversion routine should have reported an error, so this
580 * should not be reached.
581 */
582 elog(ERROR, "encoding conversion failed without error");
583 }
584}
585
586/*
587 * Load more data from data source to raw_buf.
588 *
589 * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
590 * beginning of the buffer, and we load new data after that.
591 */
592static void
594{
595 int nbytes;
596 int inbytes;
597
598 /*
599 * In text mode, if encoding conversion is not required, raw_buf and
600 * input_buf point to the same buffer. Their len/index better agree, too.
601 */
602 if (cstate->raw_buf == cstate->input_buf)
603 {
604 Assert(!cstate->need_transcoding);
605 Assert(cstate->raw_buf_index == cstate->input_buf_index);
606 Assert(cstate->input_buf_len <= cstate->raw_buf_len);
607 }
608
609 /*
610 * Copy down the unprocessed data if any.
611 */
612 nbytes = RAW_BUF_BYTES(cstate);
613 if (nbytes > 0 && cstate->raw_buf_index > 0)
614 memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
615 nbytes);
616 cstate->raw_buf_len -= cstate->raw_buf_index;
617 cstate->raw_buf_index = 0;
618
619 /*
620 * If raw_buf and input_buf are in fact the same buffer, adjust the
621 * input_buf variables, too.
622 */
623 if (cstate->raw_buf == cstate->input_buf)
624 {
625 cstate->input_buf_len -= cstate->input_buf_index;
626 cstate->input_buf_index = 0;
627 }
628
629 /* Load more data */
630 inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
631 1, RAW_BUF_SIZE - cstate->raw_buf_len);
632 nbytes += inbytes;
633 cstate->raw_buf[nbytes] = '\0';
634 cstate->raw_buf_len = nbytes;
635
636 cstate->bytes_processed += inbytes;
638
639 if (inbytes == 0)
640 cstate->raw_reached_eof = true;
641}
642
643/*
644 * CopyLoadInputBuf loads some more data into input_buf
645 *
646 * On return, at least one more input character is loaded into
647 * input_buf, or input_reached_eof is set.
648 *
649 * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
650 * of the buffer and then we load more data after that.
651 */
652static void
654{
655 int nbytes = INPUT_BUF_BYTES(cstate);
656
657 /*
658 * The caller has updated input_buf_index to indicate how much of the
659 * input has been consumed and isn't needed anymore. If input_buf is the
660 * same physical area as raw_buf, update raw_buf_index accordingly.
661 */
662 if (cstate->raw_buf == cstate->input_buf)
663 {
664 Assert(!cstate->need_transcoding);
665 Assert(cstate->input_buf_index >= cstate->raw_buf_index);
666 cstate->raw_buf_index = cstate->input_buf_index;
667 }
668
669 for (;;)
670 {
671 /* If we now have some unconverted data, try to convert it */
672 CopyConvertBuf(cstate);
673
674 /* If we now have some more input bytes ready, return them */
675 if (INPUT_BUF_BYTES(cstate) > nbytes)
676 return;
677
678 /*
679 * If we reached an invalid byte sequence, or we're at an incomplete
680 * multi-byte character but there is no more raw input data, report
681 * conversion error.
682 */
683 if (cstate->input_reached_error)
684 CopyConversionError(cstate);
685
686 /* no more input, and everything has been converted */
687 if (cstate->input_reached_eof)
688 break;
689
690 /* Try to load more raw data */
691 Assert(!cstate->raw_reached_eof);
692 CopyLoadRawBuf(cstate);
693 }
694}
695
696/*
697 * CopyReadBinaryData
698 *
699 * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
700 * and writes them to 'dest'. Returns the number of bytes read (which
701 * would be less than 'nbytes' only if we reach EOF).
702 */
703static int
704CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
705{
706 int copied_bytes = 0;
707
708 if (RAW_BUF_BYTES(cstate) >= nbytes)
709 {
710 /* Enough bytes are present in the buffer. */
711 memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
712 cstate->raw_buf_index += nbytes;
713 copied_bytes = nbytes;
714 }
715 else
716 {
717 /*
718 * Not enough bytes in the buffer, so must read from the file. Need
719 * to loop since 'nbytes' could be larger than the buffer size.
720 */
721 do
722 {
723 int copy_bytes;
724
725 /* Load more data if buffer is empty. */
726 if (RAW_BUF_BYTES(cstate) == 0)
727 {
728 CopyLoadRawBuf(cstate);
729 if (cstate->raw_reached_eof)
730 break; /* EOF */
731 }
732
733 /* Transfer some bytes. */
734 copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
735 memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
736 cstate->raw_buf_index += copy_bytes;
737 dest += copy_bytes;
739 } while (copied_bytes < nbytes);
740 }
741
742 return copied_bytes;
743}
744
745/*
746 * This function is exposed for use by extensions that read raw fields in the
747 * next line. See NextCopyFromRawFieldsInternal() for details.
748 */
749bool
750NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
751{
752 return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
753 cstate->opts.csv_mode);
754}
755
756/*
757 * Workhorse for NextCopyFromRawFields().
758 *
759 * Read raw fields in the next line for COPY FROM in text or csv mode. Return
760 * false if no more lines.
761 *
762 * An internal temporary buffer is returned via 'fields'. It is valid until
763 * the next call of the function. Since the function returns all raw fields
764 * in the input file, 'nfields' could be different from the number of columns
765 * in the relation.
766 *
767 * NOTE: force_not_null option are not applied to the returned fields.
768 *
769 * We use pg_attribute_always_inline to reduce function call overhead
770 * and to help compilers to optimize away the 'is_csv' condition when called
771 * by internal functions such as CopyFromTextLikeOneRow().
772 */
774NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
775{
776 int fldct;
777 bool done = false;
778
779 /* only available for text or csv input */
780 Assert(!cstate->opts.binary);
781
782 /* on input check that the header line is correct if needed */
783 if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE)
784 {
785 ListCell *cur;
786 TupleDesc tupDesc;
787 int lines_to_skip = cstate->opts.header_line;
788
789 /* If set to "match", one header line is skipped */
790 if (cstate->opts.header_line == COPY_HEADER_MATCH)
791 lines_to_skip = 1;
792
793 tupDesc = RelationGetDescr(cstate->rel);
794
795 for (int i = 0; i < lines_to_skip; i++)
796 {
797 cstate->cur_lineno++;
798 if ((done = CopyReadLine(cstate, is_csv)))
799 break;
800 }
801
802 if (cstate->opts.header_line == COPY_HEADER_MATCH)
803 {
804 int fldnum;
805
806 if (is_csv)
808 else
810
811 if (fldct != list_length(cstate->attnumlist))
814 errmsg("wrong number of fields in header line: got %d, expected %d",
815 fldct, list_length(cstate->attnumlist))));
816
817 fldnum = 0;
818 foreach(cur, cstate->attnumlist)
819 {
820 int attnum = lfirst_int(cur);
821 char *colName;
822 Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
823
824 Assert(fldnum < cstate->max_fields);
825
826 colName = cstate->raw_fields[fldnum++];
827 if (colName == NULL)
830 errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
831 fldnum, cstate->opts.null_print, NameStr(attr->attname))));
832
833 if (namestrcmp(&attr->attname, colName) != 0)
834 {
837 errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
838 fldnum, colName, NameStr(attr->attname))));
839 }
840 }
841 }
842
843 if (done)
844 return false;
845 }
846
847 cstate->cur_lineno++;
848
849 /* Actually read the line into memory here */
850 done = CopyReadLine(cstate, is_csv);
851
852 /*
853 * EOF at start of line means we're done. If we see EOF after some
854 * characters, we act as though it was newline followed by EOF, ie,
855 * process the line and then exit loop on next iteration.
856 */
857 if (done && cstate->line_buf.len == 0)
858 return false;
859
860 /* Parse the line into de-escaped field values */
861 if (is_csv)
863 else
865
866 *fields = cstate->raw_fields;
867 *nfields = fldct;
868 return true;
869}
870
871/*
872 * Read next tuple from file for COPY FROM. Return false if no more tuples.
873 *
874 * 'econtext' is used to evaluate default expression for each column that is
875 * either not read from the file or is using the DEFAULT option of COPY FROM.
876 * It can be NULL when no default values are used, i.e. when all columns are
877 * read from the file, and DEFAULT option is unset.
878 *
879 * 'values' and 'nulls' arrays must be the same length as columns of the
880 * relation passed to BeginCopyFrom. This function fills the arrays.
881 */
882bool
884 Datum *values, bool *nulls)
885{
886 TupleDesc tupDesc;
888 num_defaults = cstate->num_defaults;
889 int i;
890 int *defmap = cstate->defmap;
891 ExprState **defexprs = cstate->defexprs;
892
893 tupDesc = RelationGetDescr(cstate->rel);
894 num_phys_attrs = tupDesc->natts;
895
896 /* Initialize all values for row to NULL */
897 MemSet(values, 0, num_phys_attrs * sizeof(Datum));
898 MemSet(nulls, true, num_phys_attrs * sizeof(bool));
899 MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
900
901 /* Get one row from source */
902 if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
903 return false;
904
905 /*
906 * Now compute and insert any defaults available for the columns not
907 * provided by the input data. Anything not processed here or above will
908 * remain NULL.
909 */
910 for (i = 0; i < num_defaults; i++)
911 {
912 /*
913 * The caller must supply econtext and have switched into the
914 * per-tuple memory context in it.
915 */
916 Assert(econtext != NULL);
918
919 values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
920 &nulls[defmap[i]]);
921 }
922
923 return true;
924}
925
926/* Implementation of the per-row callback for text format */
927bool
929 bool *nulls)
930{
931 return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
932}
933
934/* Implementation of the per-row callback for CSV format */
935bool
937 bool *nulls)
938{
939 return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
940}
941
942/*
943 * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
944 *
945 * We use pg_attribute_always_inline to reduce function call overhead
946 * and to help compilers to optimize away the 'is_csv' condition.
947 */
950 Datum *values, bool *nulls, bool is_csv)
951{
952 TupleDesc tupDesc;
954 FmgrInfo *in_functions = cstate->in_functions;
955 Oid *typioparams = cstate->typioparams;
956 ExprState **defexprs = cstate->defexprs;
957 char **field_strings;
958 ListCell *cur;
959 int fldct;
960 int fieldno;
961 char *string;
962
963 tupDesc = RelationGetDescr(cstate->rel);
965
966 /* read raw fields in the next line */
968 return false;
969
970 /* check for overflowing fields */
971 if (attr_count > 0 && fldct > attr_count)
974 errmsg("extra data after last expected column")));
975
976 fieldno = 0;
977
978 /* Loop to read the user attributes on the line. */
979 foreach(cur, cstate->attnumlist)
980 {
981 int attnum = lfirst_int(cur);
982 int m = attnum - 1;
983 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
984
985 if (fieldno >= fldct)
988 errmsg("missing data for column \"%s\"",
989 NameStr(att->attname))));
990 string = field_strings[fieldno++];
991
992 if (cstate->convert_select_flags &&
993 !cstate->convert_select_flags[m])
994 {
995 /* ignore input field, leaving column as NULL */
996 continue;
997 }
998
999 if (is_csv)
1000 {
1001 if (string == NULL &&
1002 cstate->opts.force_notnull_flags[m])
1003 {
1004 /*
1005 * FORCE_NOT_NULL option is set and column is NULL - convert
1006 * it to the NULL string.
1007 */
1008 string = cstate->opts.null_print;
1009 }
1010 else if (string != NULL && cstate->opts.force_null_flags[m]
1011 && strcmp(string, cstate->opts.null_print) == 0)
1012 {
1013 /*
1014 * FORCE_NULL option is set and column matches the NULL
1015 * string. It must have been quoted, or otherwise the string
1016 * would already have been set to NULL. Convert it to NULL as
1017 * specified.
1018 */
1019 string = NULL;
1020 }
1021 }
1022
1023 cstate->cur_attname = NameStr(att->attname);
1024 cstate->cur_attval = string;
1025
1026 if (string != NULL)
1027 nulls[m] = false;
1028
1029 if (cstate->defaults[m])
1030 {
1031 /* We must have switched into the per-tuple memory context */
1032 Assert(econtext != NULL);
1034
1035 values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
1036 }
1037
1038 /*
1039 * If ON_ERROR is specified with IGNORE, skip rows with soft errors
1040 */
1041 else if (!InputFunctionCallSafe(&in_functions[m],
1042 string,
1043 typioparams[m],
1044 att->atttypmod,
1045 (Node *) cstate->escontext,
1046 &values[m]))
1047 {
1049
1050 cstate->num_errors++;
1051
1053 {
1054 /*
1055 * Since we emit line number and column info in the below
1056 * notice message, we suppress error context information other
1057 * than the relation name.
1058 */
1059 Assert(!cstate->relname_only);
1060 cstate->relname_only = true;
1061
1062 if (cstate->cur_attval)
1063 {
1064 char *attval;
1065
1068 errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1069 cstate->cur_lineno,
1070 cstate->cur_attname,
1071 attval));
1072 pfree(attval);
1073 }
1074 else
1076 errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
1077 cstate->cur_lineno,
1078 cstate->cur_attname));
1079
1080 /* reset relname_only */
1081 cstate->relname_only = false;
1082 }
1083
1084 return true;
1085 }
1086
1087 cstate->cur_attname = NULL;
1088 cstate->cur_attval = NULL;
1089 }
1090
1092
1093 return true;
1094}
1095
1096/* Implementation of the per-row callback for binary format */
1097bool
1099 bool *nulls)
1100{
1101 TupleDesc tupDesc;
1103 FmgrInfo *in_functions = cstate->in_functions;
1104 Oid *typioparams = cstate->typioparams;
1106 ListCell *cur;
1107
1108 tupDesc = RelationGetDescr(cstate->rel);
1110
1111 cstate->cur_lineno++;
1112
1113 if (!CopyGetInt16(cstate, &fld_count))
1114 {
1115 /* EOF detected (end of file, or protocol-level EOF) */
1116 return false;
1117 }
1118
1119 if (fld_count == -1)
1120 {
1121 /*
1122 * Received EOF marker. Wait for the protocol-level EOF, and complain
1123 * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1124 * that we correctly handle CopyFail, if client chooses to send that
1125 * now. When copying from file, we could ignore the rest of the file
1126 * like in text mode, but we choose to be consistent with the COPY
1127 * FROM STDIN case.
1128 */
1129 char dummy;
1130
1131 if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
1132 ereport(ERROR,
1134 errmsg("received copy data after EOF marker")));
1135 return false;
1136 }
1137
1138 if (fld_count != attr_count)
1139 ereport(ERROR,
1141 errmsg("row field count is %d, expected %d",
1143
1144 foreach(cur, cstate->attnumlist)
1145 {
1146 int attnum = lfirst_int(cur);
1147 int m = attnum - 1;
1148 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1149
1150 cstate->cur_attname = NameStr(att->attname);
1151 values[m] = CopyReadBinaryAttribute(cstate,
1152 &in_functions[m],
1153 typioparams[m],
1154 att->atttypmod,
1155 &nulls[m]);
1156 cstate->cur_attname = NULL;
1157 }
1158
1159 return true;
1160}
1161
1162/*
1163 * Read the next input line and stash it in line_buf.
1164 *
1165 * Result is true if read was terminated by EOF, false if terminated
1166 * by newline. The terminating newline or EOF marker is not included
1167 * in the final value of line_buf.
1168 */
1169static bool
1171{
1172 bool result;
1173
1174 resetStringInfo(&cstate->line_buf);
1175 cstate->line_buf_valid = false;
1176
1177 /*
1178 * Parse data and transfer into line_buf.
1179 *
1180 * Because this is performance critical, we inline CopyReadLineText() and
1181 * pass the boolean parameters as constants to allow the compiler to emit
1182 * specialized code with fewer branches.
1183 */
1184 if (is_csv)
1185 result = CopyReadLineText(cstate, true);
1186 else
1187 result = CopyReadLineText(cstate, false);
1188
1189 if (result)
1190 {
1191 /*
1192 * Reached EOF. In protocol version 3, we should ignore anything
1193 * after \. up to the protocol end of copy data. (XXX maybe better
1194 * not to treat \. as special?)
1195 */
1196 if (cstate->copy_src == COPY_FRONTEND)
1197 {
1198 int inbytes;
1199
1200 do
1201 {
1202 inbytes = CopyGetData(cstate, cstate->input_buf,
1203 1, INPUT_BUF_SIZE);
1204 } while (inbytes > 0);
1205 cstate->input_buf_index = 0;
1206 cstate->input_buf_len = 0;
1207 cstate->raw_buf_index = 0;
1208 cstate->raw_buf_len = 0;
1209 }
1210 }
1211 else
1212 {
1213 /*
1214 * If we didn't hit EOF, then we must have transferred the EOL marker
1215 * to line_buf along with the data. Get rid of it.
1216 */
1217 switch (cstate->eol_type)
1218 {
1219 case EOL_NL:
1220 Assert(cstate->line_buf.len >= 1);
1221 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1222 cstate->line_buf.len--;
1223 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1224 break;
1225 case EOL_CR:
1226 Assert(cstate->line_buf.len >= 1);
1227 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1228 cstate->line_buf.len--;
1229 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1230 break;
1231 case EOL_CRNL:
1232 Assert(cstate->line_buf.len >= 2);
1233 Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1234 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1235 cstate->line_buf.len -= 2;
1236 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1237 break;
1238 case EOL_UNKNOWN:
1239 /* shouldn't get here */
1240 Assert(false);
1241 break;
1242 }
1243 }
1244
1245 /* Now it's safe to use the buffer in error messages */
1246 cstate->line_buf_valid = true;
1247
1248 return result;
1249}
1250
1251/*
1252 * CopyReadLineText - inner loop of CopyReadLine for text mode
1253 */
/*
 * Despite the header above, the is_csv argument also makes this routine
 * serve CSV mode: when it is true the quote and escape characters are taken
 * from cstate->opts and newlines inside a quoted field are treated as data.
 *
 * Returns true when the end of input or a "\." end-of-copy marker was
 * reached, false when an ordinary line terminator ended the line.  The first
 * terminator seen fixes cstate->eol_type; later terminators of a different
 * style raise ERROR.
 *
 * NOTE(review): this listing has many gaps in its embedded line numbering.
 * The two lines carrying the return type and the function name are absent;
 * the index at the end of the page gives the prototype as
 *   static pg_attribute_always_inline bool
 *   CopyReadLineText(CopyFromState cstate, bool is_csv)
 * No refill-macro invocation is visible anywhere in the body, although the
 * TODO comment below speaks of "many calls" to
 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(); the absent lines presumably hold
 * those invocations and the input_buf_ptr updates -- confirm against the
 * real source file before relying on the control flow shown here.
 */
1256{
1257 char *copy_input_buf;
1258 int input_buf_ptr;
1259 int copy_buf_len;
1260 bool need_data = false;
1261 bool hit_eof = false;
1262 bool result = false;
1263
1264 /* CSV variables */
1265 bool in_quote = false,
1266 last_was_esc = false;
1267 char quotec = '\0';
1268 char escapec = '\0';
1269
1270 if (is_csv)
1271 {
1272 quotec = cstate->opts.quote[0];
1273 escapec = cstate->opts.escape[0];
1274 /* ignore special escape processing if it's the same as quotec */
1275 if (quotec == escapec)
1276 escapec = '\0';
1277 }
1278
1279 /*
1280 * The objective of this loop is to transfer the entire next input line
1281 * into line_buf. Hence, we only care for detecting newlines (\r and/or
1282 * \n) and the end-of-copy marker (\.).
1283 *
1284 * In CSV mode, \r and \n inside a quoted field are just part of the data
1285 * value and are put in line_buf. We keep just enough state to know if we
1286 * are currently in a quoted field or not.
1287 *
1288 * The input has already been converted to the database encoding. All
1289 * supported server encodings have the property that all bytes in a
1290 * multi-byte sequence have the high bit set, so a multibyte character
1291 * cannot contain any newline or escape characters embedded in the
1292 * multibyte sequence. Therefore, we can process the input byte-by-byte,
1293 * regardless of the encoding.
1294 *
1295 * For speed, we try to move data from input_buf to line_buf in chunks
1296 * rather than one character at a time. input_buf_ptr points to the next
1297 * character to examine; any characters from input_buf_index to
1298 * input_buf_ptr have been determined to be part of the line, but not yet
1299 * transferred to line_buf.
1300 *
1301 * For a little extra speed within the loop, we copy input_buf and
1302 * input_buf_len into local variables.
1303 */
1304 copy_input_buf = cstate->input_buf;
/* NOTE(review): listing line 1305 is absent; input_buf_ptr is presumably initialised there -- TODO confirm */
1306 copy_buf_len = cstate->input_buf_len;
1307
1308 for (;;)
1309 {
1310 int prev_raw_ptr;
1311 char c;
1312
1313 /*
1314 * Load more data if needed.
1315 *
1316 * TODO: We could just force four bytes of read-ahead and avoid the
1317 * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(). That was
1318 * unsafe with the old v2 COPY protocol, but we don't support that
1319 * anymore.
1320 */
/* NOTE(review): the condition guarding this block (listing line 1321) is absent */
1322 {
1324
1325 CopyLoadInputBuf(cstate);
1326 /* update our local variables */
1327 hit_eof = cstate->input_reached_eof;
1329 copy_buf_len = cstate->input_buf_len;
1330
1331 /*
1332 * If we are completely out of data, break out of the loop,
1333 * reporting EOF.
1334 */
1335 if (INPUT_BUF_BYTES(cstate) <= 0)
1336 {
1337 result = true;
1338 break;
1339 }
1340 need_data = false;
1341 }
1342
1343 /* OK to fetch a character */
/* NOTE(review): the fetch itself (listing lines 1344-1345) is absent; c and prev_raw_ptr are presumably assigned there */
1346
1347 if (is_csv)
1348 {
1349 /*
1350 * If character is '\r', we may need to look ahead below. Force
1351 * fetch of the next character if we don't already have it. We
1352 * need to do this before changing CSV state, in case '\r' is also
1353 * the quote or escape character.
1354 */
1355 if (c == '\r')
1356 {
1358 }
1359
1360 /*
1361 * Dealing with quotes and escapes here is mildly tricky. If the
1362 * quote char is also the escape char, there's no problem - we
1363 * just use the char as a toggle. If they are different, we need
1364 * to ensure that we only take account of an escape inside a
1365 * quoted field and immediately preceding a quote char, and not
1366 * the second in an escape-escape sequence.
1367 */
1368 if (in_quote && c == escapec)
/*
 * NOTE(review): listing line 1369, the statement controlled by the "if"
 * above, is absent.  The next "if" is presumably an independent statement,
 * not the body of the one above -- TODO confirm.
 */
1370 if (c == quotec && !last_was_esc)
1371 in_quote = !in_quote;
1372 if (c != escapec)
1373 last_was_esc = false;
1374
1375 /*
1376 * Updating the line count for embedded CR and/or LF chars is
1377 * necessarily a little fragile - this test is probably about the
1378 * best we can do. (XXX it's arguable whether we should do this
1379 * at all --- is cur_lineno a physical or logical count?)
1380 */
1381 if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
1382 cstate->cur_lineno++;
1383 }
1384
1385 /* Process \r */
1386 if (c == '\r' && (!is_csv || !in_quote))
1387 {
1388 /* Check for \r\n on first line, _and_ handle \r\n. */
1389 if (cstate->eol_type == EOL_UNKNOWN ||
1390 cstate->eol_type == EOL_CRNL)
1391 {
1392 /*
1393 * If need more data, go back to loop top to load it.
1394 *
1395 * Note that if we are at EOF, c will wind up as '\0' because
1396 * of the guaranteed pad of input_buf.
1397 */
1399
1400 /* get next char */
1402
1403 if (c == '\n')
1404 {
1405 input_buf_ptr++; /* eat newline */
1406 cstate->eol_type = EOL_CRNL; /* in case not set yet */
1407 }
1408 else
1409 {
1410 /* found \r, but no \n */
1411 if (cstate->eol_type == EOL_CRNL)
1412 ereport(ERROR,
1414 !is_csv ?
1415 errmsg("literal carriage return found in data") :
1416 errmsg("unquoted carriage return found in data"),
1417 !is_csv ?
1418 errhint("Use \"\\r\" to represent carriage return.") :
1419 errhint("Use quoted CSV field to represent carriage return.")));
1420
1421 /*
1422 * if we got here, it is the first line and we didn't find
1423 * \n, so don't consume the peeked character
1424 */
1425 cstate->eol_type = EOL_CR;
1426 }
1427 }
1428 else if (cstate->eol_type == EOL_NL)
1429 ereport(ERROR,
1431 !is_csv ?
1432 errmsg("literal carriage return found in data") :
1433 errmsg("unquoted carriage return found in data"),
1434 !is_csv ?
1435 errhint("Use \"\\r\" to represent carriage return.") :
1436 errhint("Use quoted CSV field to represent carriage return.")));
1437 /* If reach here, we have found the line terminator */
1438 break;
1439 }
1440
1441 /* Process \n */
1442 if (c == '\n' && (!is_csv || !in_quote))
1443 {
1444 if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
1445 ereport(ERROR,
1447 !is_csv ?
1448 errmsg("literal newline found in data") :
1449 errmsg("unquoted newline found in data"),
1450 !is_csv ?
1451 errhint("Use \"\\n\" to represent newline.") :
1452 errhint("Use quoted CSV field to represent newline.")));
1453 cstate->eol_type = EOL_NL; /* in case not set yet */
1454 /* If reach here, we have found the line terminator */
1455 break;
1456 }
1457
1458 /*
1459 * Process backslash, except in CSV mode where backslash is a normal
1460 * character.
1461 */
1462 if (c == '\\' && !is_csv)
1463 {
1464 char c2;
1465
1468
1469 /* -----
1470 * get next character
1471 * Note: we do not change c so if it isn't \., we can fall
1472 * through and continue processing.
1473 * -----
1474 */
/* NOTE(review): the assignment to c2 (listing line 1475) is absent */
1476
1477 if (c2 == '.')
1478 {
1479 input_buf_ptr++; /* consume the '.' */
1480 if (cstate->eol_type == EOL_CRNL)
1481 {
1482 /* Get the next character */
1484 /* if hit_eof, c2 will become '\0' */
1486
1487 if (c2 == '\n')
1488 ereport(ERROR,
1490 errmsg("end-of-copy marker does not match previous newline style")));
1491 else if (c2 != '\r')
1492 ereport(ERROR,
1494 errmsg("end-of-copy marker is not alone on its line")));
1495 }
1496
1497 /* Get the next character */
1499 /* if hit_eof, c2 will become '\0' */
1501
1502 if (c2 != '\r' && c2 != '\n')
1503 ereport(ERROR,
1505 errmsg("end-of-copy marker is not alone on its line")));
1506
1507 if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
1508 (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
1509 (cstate->eol_type == EOL_CR && c2 != '\r'))
1510 ereport(ERROR,
1512 errmsg("end-of-copy marker does not match previous newline style")));
1513
1514 /*
1515 * If there is any data on this line before the \., complain.
1516 */
1517 if (cstate->line_buf.len > 0 ||
1518 prev_raw_ptr > cstate->input_buf_index)
1519 ereport(ERROR,
1521 errmsg("end-of-copy marker is not alone on its line")));
1522
1523 /*
1524 * Discard the \. and newline, then report EOF.
1525 */
1527 result = true; /* report EOF */
1528 break;
1529 }
1530 else
1531 {
1532 /*
1533 * If we are here, it means we found a backslash followed by
1534 * something other than a period. In non-CSV mode, anything
1535 * after a backslash is special, so we skip over that second
1536 * character too. If we didn't do that \\. would be
1537 * considered an end-of-copy marker, while in non-CSV mode it
1538 * is a literal backslash followed by a period.
1539 */
1540 input_buf_ptr++;
1541 }
1542 }
1543 } /* end of outer loop */
1544
1545 /*
1546 * Transfer any still-uncopied data to line_buf.
1547 */
/* NOTE(review): the transfer statement (listing line 1548) is absent */
1549
1550 return result;
1551}
1552
/*
 * Return decimal value for a hexadecimal digit
 *
 * 'hex' is expected to be a hexadecimal digit: '0'-'9' yields 0-9, and a
 * letter is folded to lower case with pg_ascii_tolower() so that 'a'-'f'
 * and 'A'-'F' both yield 10-15.
 *
 * No validation is done here; any other character produces a meaningless
 * value.  The visible caller tests isxdigit() before calling.
 */
static int
GetDecimalFromHex(char hex)
{
	/* cast to unsigned char: <ctype.h> functions need a non-negative value */
	if (isdigit((unsigned char) hex))
		return hex - '0';
	else
		return pg_ascii_tolower((unsigned char) hex) - 'a' + 10;
}
1564
1565/*
1566 * Parse the current line into separate attributes (fields),
1567 * performing de-escaping as needed.
1568 *
1569 * The input is in line_buf. We use attribute_buf to hold the result
1570 * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1571 * string, or NULL when the input matches the null marker string.
1572 * This array is expanded as necessary.
1573 *
1574 * (Note that the caller cannot check for nulls since the returned
1575 * string would be the post-de-escaping equivalent, which may look
1576 * the same as some valid data string.)
1577 *
1578 * delim is the column delimiter string (must be just one byte for now).
1579 * null_print is the null marker string. Note that this is compared to
1580 * the pre-de-escaped input string.
1581 *
1582 * The return value is the number of fields actually read.
1583 */
/*
 * NOTE(review): this listing has gaps in its embedded line numbering.  The
 * line holding the function name is absent; the index at the end of the page
 * gives the prototype as
 *   static int CopyReadAttributesText(CopyFromState cstate)
 * Also absent: the line following each "ereport(ERROR,", listing line 1606,
 * the assignment to start_ptr (line 1642) and the declaration of val in the
 * \x case (line 1720) -- confirm against the real source file.
 */
1584static int
1586{
1587 char delimc = cstate->opts.delim[0];
1588 int fieldno;
1589 char *output_ptr;
1590 char *cur_ptr;
1591 char *line_end_ptr;
1592
1593 /*
1594 * We need a special case for zero-column tables: check that the input
1595 * line is empty, and return.
1596 */
1597 if (cstate->max_fields <= 0)
1598 {
1599 if (cstate->line_buf.len != 0)
1600 ereport(ERROR,
1602 errmsg("extra data after last expected column")));
1603 return 0;
1604 }
1605
1607
1608 /*
1609 * The de-escaped attributes will certainly not be longer than the input
1610 * data line, so we can just force attribute_buf to be large enough and
1611 * then transfer data without any checks for enough space. We need to do
1612 * it this way because enlarging attribute_buf mid-stream would invalidate
1613 * pointers already stored into cstate->raw_fields[].
1614 */
1615 if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1616 enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1617 output_ptr = cstate->attribute_buf.data;
1618
1619 /* set pointer variables for loop */
1620 cur_ptr = cstate->line_buf.data;
1621 line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1622
1623 /* Outer loop iterates over fields */
1624 fieldno = 0;
1625 for (;;)
1626 {
1627 bool found_delim = false;
1628 char *start_ptr;
1629 char *end_ptr;
1630 int input_len;
1631 bool saw_non_ascii = false;
1632
1633 /* Make sure there is enough space for the next value */
1634 if (fieldno >= cstate->max_fields)
1635 {
1636 cstate->max_fields *= 2;
1637 cstate->raw_fields =
1638 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1639 }
1640
1641 /* Remember start of field on both input and output sides */
/* NOTE(review): the input-side assignment to start_ptr (listing line 1642) is absent */
1643 cstate->raw_fields[fieldno] = output_ptr;
1644
1645 /*
1646 * Scan data for field.
1647 *
1648 * Note that in this loop, we are scanning to locate the end of field
1649 * and also speculatively performing de-escaping. Once we find the
1650 * end-of-field, we can match the raw field contents against the null
1651 * marker string. Only after that comparison fails do we know that
1652 * de-escaping is actually the right thing to do; therefore we *must
1653 * not* throw any syntax errors before we've done the null-marker
1654 * check.
1655 */
1656 for (;;)
1657 {
1658 char c;
1659
/* end_ptr tracks the end of the raw field, excluding any delimiter */
1660 end_ptr = cur_ptr;
1661 if (cur_ptr >= line_end_ptr)
1662 break;
1663 c = *cur_ptr++;
1664 if (c == delimc)
1665 {
1666 found_delim = true;
1667 break;
1668 }
1669 if (c == '\\')
1670 {
/* a backslash that is the last byte of the line is dropped */
1671 if (cur_ptr >= line_end_ptr)
1672 break;
1673 c = *cur_ptr++;
1674 switch (c)
1675 {
1676 case '0':
1677 case '1':
1678 case '2':
1679 case '3':
1680 case '4':
1681 case '5':
1682 case '6':
1683 case '7':
1684 {
1685 /* handle \013 */
/* up to three octal digits are consumed in total */
1686 int val;
1687
1688 val = OCTVALUE(c);
1689 if (cur_ptr < line_end_ptr)
1690 {
1691 c = *cur_ptr;
1692 if (ISOCTAL(c))
1693 {
1694 cur_ptr++;
1695 val = (val << 3) + OCTVALUE(c);
1696 if (cur_ptr < line_end_ptr)
1697 {
1698 c = *cur_ptr;
1699 if (ISOCTAL(c))
1700 {
1701 cur_ptr++;
1702 val = (val << 3) + OCTVALUE(c);
1703 }
1704 }
1705 }
1706 }
1707 c = val & 0377;
1708 if (c == '\0' || IS_HIGHBIT_SET(c))
1709 saw_non_ascii = true;
1710 }
1711 break;
1712 case 'x':
1713 /* Handle \x3F */
/* one or two hex digits; with none, the 'x' itself is kept as data */
1714 if (cur_ptr < line_end_ptr)
1715 {
1716 char hexchar = *cur_ptr;
1717
1718 if (isxdigit((unsigned char) hexchar))
1719 {
1721
1722 cur_ptr++;
1723 if (cur_ptr < line_end_ptr)
1724 {
1725 hexchar = *cur_ptr;
1726 if (isxdigit((unsigned char) hexchar))
1727 {
1728 cur_ptr++;
1729 val = (val << 4) + GetDecimalFromHex(hexchar);
1730 }
1731 }
1732 c = val & 0xff;
1733 if (c == '\0' || IS_HIGHBIT_SET(c))
1734 saw_non_ascii = true;
1735 }
1736 }
1737 break;
1738 case 'b':
1739 c = '\b';
1740 break;
1741 case 'f':
1742 c = '\f';
1743 break;
1744 case 'n':
1745 c = '\n';
1746 break;
1747 case 'r':
1748 c = '\r';
1749 break;
1750 case 't':
1751 c = '\t';
1752 break;
1753 case 'v':
1754 c = '\v';
1755 break;
1756
1757 /*
1758 * in all other cases, take the char after '\'
1759 * literally
1760 */
1761 }
1762 }
1763
1764 /* Add c to output string */
1765 *output_ptr++ = c;
1766 }
1767
1768 /* Check whether raw input matched null marker */
1769 input_len = end_ptr - start_ptr;
1770 if (input_len == cstate->opts.null_print_len &&
1771 strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1772 cstate->raw_fields[fieldno] = NULL;
1773 /* Check whether raw input matched default marker */
1774 else if (fieldno < list_length(cstate->attnumlist) &&
1775 cstate->opts.default_print &&
1776 input_len == cstate->opts.default_print_len &&
1777 strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1778 {
1779 /* fieldno is 0-indexed and attnum is 1-indexed */
1780 int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1781
1782 if (cstate->defexprs[m] != NULL)
1783 {
1784 /* defaults contain entries for all physical attributes */
1785 cstate->defaults[m] = true;
1786 }
1787 else
1788 {
1789 TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1790 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1791
1792 ereport(ERROR,
1794 errmsg("unexpected default marker in COPY data"),
1795 errdetail("Column \"%s\" has no default value.",
1796 NameStr(att->attname))));
1797 }
1798 }
1799 else
1800 {
1801 /*
1802 * At this point we know the field is supposed to contain data.
1803 *
1804 * If we de-escaped any non-7-bit-ASCII chars, make sure the
1805 * resulting string is valid data for the db encoding.
1806 */
1807 if (saw_non_ascii)
1808 {
1809 char *fld = cstate->raw_fields[fieldno];
1810
1811 pg_verifymbstr(fld, output_ptr - fld, false);
1812 }
1813 }
1814
1815 /* Terminate attribute value in output area */
1816 *output_ptr++ = '\0';
1817
1818 fieldno++;
1819 /* Done if we hit EOL instead of a delim */
1820 if (!found_delim)
1821 break;
1822 }
1823
1824 /* Clean up state of attribute_buf */
/* step back onto the last terminator so len excludes it */
1825 output_ptr--;
1826 Assert(*output_ptr == '\0');
1827 cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1828
1829 return fieldno;
1830}
1831
1832/*
1833 * Parse the current line into separate attributes (fields),
1834 * performing de-escaping as needed. This has exactly the same API as
1835 * CopyReadAttributesText, except we parse the fields according to
1836 * "standard" (i.e. common) CSV usage.
1837 */
/*
 * Differences from the text-format parser that are visible in this body: a
 * field is never treated as the null marker once a quote character has been
 * seen in it (saw_quote), and a line that ends while still inside a quoted
 * section raises ERROR "unterminated CSV quoted field".
 *
 * NOTE(review): this listing has gaps in its embedded line numbering.  The
 * line holding the function name is absent; the index at the end of the page
 * gives the prototype as
 *   static int CopyReadAttributesCSV(CopyFromState cstate)
 * Also absent: the line following each "ereport(ERROR,", listing line 1862
 * and the assignment to start_ptr (line 1898) -- confirm against the real
 * source file.
 */
1838static int
1840{
1841 char delimc = cstate->opts.delim[0];
1842 char quotec = cstate->opts.quote[0];
1843 char escapec = cstate->opts.escape[0];
1844 int fieldno;
1845 char *output_ptr;
1846 char *cur_ptr;
1847 char *line_end_ptr;
1848
1849 /*
1850 * We need a special case for zero-column tables: check that the input
1851 * line is empty, and return.
1852 */
1853 if (cstate->max_fields <= 0)
1854 {
1855 if (cstate->line_buf.len != 0)
1856 ereport(ERROR,
1858 errmsg("extra data after last expected column")));
1859 return 0;
1860 }
1861
1863
1864 /*
1865 * The de-escaped attributes will certainly not be longer than the input
1866 * data line, so we can just force attribute_buf to be large enough and
1867 * then transfer data without any checks for enough space. We need to do
1868 * it this way because enlarging attribute_buf mid-stream would invalidate
1869 * pointers already stored into cstate->raw_fields[].
1870 */
1871 if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1872 enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1873 output_ptr = cstate->attribute_buf.data;
1874
1875 /* set pointer variables for loop */
1876 cur_ptr = cstate->line_buf.data;
1877 line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1878
1879 /* Outer loop iterates over fields */
1880 fieldno = 0;
1881 for (;;)
1882 {
1883 bool found_delim = false;
1884 bool saw_quote = false;
1885 char *start_ptr;
1886 char *end_ptr;
1887 int input_len;
1888
1889 /* Make sure there is enough space for the next value */
1890 if (fieldno >= cstate->max_fields)
1891 {
1892 cstate->max_fields *= 2;
1893 cstate->raw_fields =
1894 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1895 }
1896
1897 /* Remember start of field on both input and output sides */
/* NOTE(review): the input-side assignment to start_ptr (listing line 1898) is absent */
1899 cstate->raw_fields[fieldno] = output_ptr;
1900
1901 /*
1902 * Scan data for field.
1903 *
1904 * The loop starts in "not quote" mode and then toggles between that
1905 * and "in quote" mode. The loop exits normally if it is in "not
1906 * quote" mode and a delimiter or line end is seen.
1907 */
1908 for (;;)
1909 {
1910 char c;
1911
1912 /* Not in quote */
1913 for (;;)
1914 {
1915 end_ptr = cur_ptr;
1916 if (cur_ptr >= line_end_ptr)
1917 goto endfield;
1918 c = *cur_ptr++;
1919 /* unquoted field delimiter */
1920 if (c == delimc)
1921 {
1922 found_delim = true;
1923 goto endfield;
1924 }
1925 /* start of quoted field (or part of field) */
1926 if (c == quotec)
1927 {
1928 saw_quote = true;
1929 break;
1930 }
1931 /* Add c to output string */
1932 *output_ptr++ = c;
1933 }
1934
1935 /* In quote */
1936 for (;;)
1937 {
1938 end_ptr = cur_ptr;
1939 if (cur_ptr >= line_end_ptr)
1940 ereport(ERROR,
1942 errmsg("unterminated CSV quoted field")));
1943
1944 c = *cur_ptr++;
1945
1946 /* escape within a quoted field */
1947 if (c == escapec)
1948 {
1949 /*
1950 * peek at the next char if available, and escape it if it
1951 * is an escape char or a quote char
1952 */
1953 if (cur_ptr < line_end_ptr)
1954 {
1955 char nextc = *cur_ptr;
1956
1957 if (nextc == escapec || nextc == quotec)
1958 {
1959 *output_ptr++ = nextc;
1960 cur_ptr++;
1961 continue;
1962 }
1963 }
1964 }
1965
1966 /*
1967 * end of quoted field. Must do this test after testing for
1968 * escape in case quote char and escape char are the same
1969 * (which is the common case).
1970 */
1971 if (c == quotec)
1972 break;
1973
1974 /* Add c to output string */
1975 *output_ptr++ = c;
1976 }
1977 }
1978endfield:
1979
1980 /* Terminate attribute value in output area */
1981 *output_ptr++ = '\0';
1982
1983 /* Check whether raw input matched null marker */
1984 input_len = end_ptr - start_ptr;
1985 if (!saw_quote && input_len == cstate->opts.null_print_len &&
1986 strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1987 cstate->raw_fields[fieldno] = NULL;
1988 /* Check whether raw input matched default marker */
1989 else if (fieldno < list_length(cstate->attnumlist) &&
1990 cstate->opts.default_print &&
1991 input_len == cstate->opts.default_print_len &&
1992 strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1993 {
1994 /* fieldno is 0-indexed and attnum is 1-indexed */
1995 int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1996
1997 if (cstate->defexprs[m] != NULL)
1998 {
1999 /* defaults contain entries for all physical attributes */
2000 cstate->defaults[m] = true;
2001 }
2002 else
2003 {
2004 TupleDesc tupDesc = RelationGetDescr(cstate->rel);
2005 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
2006
2007 ereport(ERROR,
2009 errmsg("unexpected default marker in COPY data"),
2010 errdetail("Column \"%s\" has no default value.",
2011 NameStr(att->attname))));
2012 }
2013 }
2014
2015 fieldno++;
2016 /* Done if we hit EOL instead of a delim */
2017 if (!found_delim)
2018 break;
2019 }
2020
2021 /* Clean up state of attribute_buf */
/* step back onto the last terminator so len excludes it */
2022 output_ptr--;
2023 Assert(*output_ptr == '\0');
2024 cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2025
2026 return fieldno;
2027}
2028
2029
2030/*
2031 * Read a binary attribute
2032 */
/*
 * Reads a 32-bit field size followed by that many bytes, and converts the
 * bytes with the column's binary receive function.
 *
 * flinfo, typioparam and typmod are passed through to ReceiveFunctionCall().
 * *isnull is set to true when the field size is -1, in which case the
 * receive function is called with a NULL buffer; otherwise it is set to
 * false.  The converted Datum is returned.
 *
 * Raises ERROR when the size or the data cannot be read in full
 * ("unexpected EOF in COPY data"), when the size is negative and not -1
 * ("invalid field size"), or when the receive function leaves part of the
 * buffer unread ("incorrect binary data format").
 *
 * NOTE(review): this listing has gaps in its embedded line numbering.  The
 * line holding the function name and leading parameters is absent; the index
 * at the end of the page gives the prototype as
 *   static Datum CopyReadBinaryAttribute(CopyFromState cstate,
 *                                        FmgrInfo *flinfo, Oid typioparam,
 *                                        int32 typmod, bool *isnull)
 * Also absent: the declaration of fld_size, the line following each
 * "ereport(ERROR,", and listing lines 2056 and 2058 -- confirm against the
 * real source file.
 */
2033static Datum
2035 Oid typioparam, int32 typmod,
2036 bool *isnull)
2037{
2039 Datum result;
2040
2041 if (!CopyGetInt32(cstate, &fld_size))
2042 ereport(ERROR,
2044 errmsg("unexpected EOF in COPY data")));
2045 if (fld_size == -1)
2046 {
2047 *isnull = true;
2048 return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2049 }
2050 if (fld_size < 0)
2051 ereport(ERROR,
2053 errmsg("invalid field size")));
2054
2055 /* reset attribute_buf to empty, and load raw data in it */
/* NOTE(review): the reset and any buffer sizing (listing lines 2056, 2058) are absent from this listing */
2057
2059 if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2060 fld_size) != fld_size)
2061 ereport(ERROR,
2063 errmsg("unexpected EOF in COPY data")));
2064
2065 cstate->attribute_buf.len = fld_size;
2066 cstate->attribute_buf.data[fld_size] = '\0';
2067
2068 /* Call the column type's binary input converter */
2069 result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2070 typioparam, typmod);
2071
2072 /* Trouble if it didn't eat the whole buffer */
2073 if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2074 ereport(ERROR,
2076 errmsg("incorrect binary data format")));
2077
2078 *isnull = false;
2079 return result;
2080}
int16 AttrNumber
Definition attnum.h:21
void pgstat_progress_update_param(int index, int64 val)
static Datum values[MAXATTR]
Definition bootstrap.c:147
#define NameStr(name)
Definition c.h:777
#define Min(x, y)
Definition c.h:1019
#define IS_HIGHBIT_SET(ch)
Definition c.h:1172
#define Assert(condition)
Definition c.h:885
#define pg_attribute_always_inline
Definition c.h:291
int16_t int16
Definition c.h:553
int32_t int32
Definition c.h:554
uint16_t uint16
Definition c.h:557
uint32_t uint32
Definition c.h:558
#define MemSet(start, val, len)
Definition c.h:1035
char * CopyLimitPrintoutLength(const char *str)
Definition copyfrom.c:333
#define RAW_BUF_BYTES(cstate)
#define INPUT_BUF_SIZE
@ EOL_CR
@ EOL_CRNL
@ EOL_UNKNOWN
@ EOL_NL
#define INPUT_BUF_BYTES(cstate)
#define RAW_BUF_SIZE
static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls, bool is_csv)
static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
bool CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
bool CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
static int CopyReadAttributesCSV(CopyFromState cstate)
static bool CopyGetInt16(CopyFromState cstate, int16 *val)
static void CopyConversionError(CopyFromState cstate)
static bool CopyGetInt32(CopyFromState cstate, int32 *val)
static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate, bool is_csv)
static void CopyLoadRawBuf(CopyFromState cstate)
#define OCTVALUE(c)
#define REFILL_LINEBUF
static void CopyLoadInputBuf(CopyFromState cstate)
#define ISOCTAL(c)
void ReceiveCopyBinaryHeader(CopyFromState cstate)
static int CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, Oid typioparam, int32 typmod, bool *isnull)
static int GetDecimalFromHex(char hex)
void ReceiveCopyBegin(CopyFromState cstate)
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen)
static int CopyReadAttributesText(CopyFromState cstate)
static const char BinarySignature[11]
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen)
static bool CopyReadLine(CopyFromState cstate, bool is_csv)
static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
bool CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
static void CopyConvertBuf(CopyFromState cstate)
bool NextCopyFrom(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
bool NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
@ COPY_FILE
Definition copyto.c:47
@ COPY_CALLBACK
Definition copyto.c:49
@ COPY_FRONTEND
Definition copyto.c:48
struct cursor * cur
Definition ecpg.c:29
int errcode_for_file_access(void)
Definition elog.c:897
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
int errhint(const char *fmt,...) pg_attribute_printf(1
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define NOTICE
Definition elog.h:35
#define ereport(elevel,...)
Definition elog.h:150
static Datum ExecEvalExpr(ExprState *state, ExprContext *econtext, bool *isNull)
Definition executor.h:393
#define ERRCODE_PROTOCOL_VIOLATION
Definition fe-connect.c:96
bool InputFunctionCallSafe(FmgrInfo *flinfo, char *str, Oid typioparam, int32 typmod, Node *escontext, Datum *result)
Definition fmgr.c:1585
Datum ReceiveFunctionCall(FmgrInfo *flinfo, StringInfo buf, Oid typioparam, int32 typmod)
Definition fmgr.c:1697
@ COPY_ON_ERROR_STOP
Definition copy.h:36
@ COPY_LOG_VERBOSITY_VERBOSE
Definition copy.h:48
#define COPY_HEADER_MATCH
Definition copy.h:26
#define COPY_HEADER_FALSE
Definition copy.h:27
long val
Definition informix.c:689
int i
Definition isn.c:77
#define pq_flush()
Definition libpq.h:49
#define PQ_SMALL_MESSAGE_LIMIT
Definition libpq.h:33
#define PQ_LARGE_MESSAGE_LIMIT
Definition libpq.h:34
int GetDatabaseEncoding(void)
Definition mbutils.c:1389
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition mbutils.c:1684
int pg_do_encoding_conversion_buf(Oid proc, int src_encoding, int dest_encoding, unsigned char *src, int srclen, unsigned char *dest, int destlen, bool noError)
Definition mbutils.c:478
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition mbutils.c:1826
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
#define HOLD_CANCEL_INTERRUPTS()
Definition miscadmin.h:142
#define RESUME_CANCEL_INTERRUPTS()
Definition miscadmin.h:144
int namestrcmp(Name name, const char *str)
Definition name.c:247
int16 attnum
FormData_pg_attribute * Form_pg_attribute
static char format
#define pg_ntoh32(x)
Definition pg_bswap.h:125
#define pg_ntoh16(x)
Definition pg_bswap.h:124
static int list_length(const List *l)
Definition pg_list.h:152
#define lfirst_int(lc)
Definition pg_list.h:173
static int list_nth_int(const List *list, int n)
Definition pg_list.h:310
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define MAX_CONVERSION_INPUT_LENGTH
Definition pg_wchar.h:320
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition port.h:188
uint64_t Datum
Definition postgres.h:70
unsigned int Oid
int pq_getmessage(StringInfo s, int maxlen)
Definition pqcomm.c:1204
int pq_getbyte(void)
Definition pqcomm.c:964
void pq_startmsgread(void)
Definition pqcomm.c:1142
const char * pq_getmsgstring(StringInfo msg)
Definition pqformat.c:578
void pq_endmessage(StringInfo buf)
Definition pqformat.c:296
void pq_copymsgbytes(StringInfo msg, void *buf, int datalen)
Definition pqformat.c:527
void pq_beginmessage(StringInfo buf, char msgtype)
Definition pqformat.c:88
static void pq_sendbyte(StringInfo buf, uint8 byt)
Definition pqformat.h:160
static void pq_sendint16(StringInfo buf, uint16 i)
Definition pqformat.h:136
char * c
static int fb(int x)
char string[11]
#define PROGRESS_COPY_BYTES_PROCESSED
Definition progress.h:164
#define PqMsg_CopyDone
Definition protocol.h:64
#define PqMsg_CopyData
Definition protocol.h:65
#define PqMsg_CopyInResponse
Definition protocol.h:45
#define PqMsg_Sync
Definition protocol.h:27
#define PqMsg_CopyFail
Definition protocol.h:29
#define PqMsg_Flush
Definition protocol.h:24
#define RelationGetDescr(relation)
Definition rel.h:540
StringInfo makeStringInfo(void)
Definition stringinfo.c:72
void resetStringInfo(StringInfo str)
Definition stringinfo.c:126
void enlargeStringInfo(StringInfo str, int needed)
Definition stringinfo.c:337
int header_line
Definition copy.h:64
int default_print_len
Definition copy.h:70
int null_print_len
Definition copy.h:67
CopyLogVerbosityChoice log_verbosity
Definition copy.h:85
char * quote
Definition copy.h:72
CopyOnErrorChoice on_error
Definition copy.h:84
char * escape
Definition copy.h:73
char * null_print
Definition copy.h:66
char * delim
Definition copy.h:71
bool * force_notnull_flags
Definition copy.h:79
bool csv_mode
Definition copy.h:63
bool * force_null_flags
Definition copy.h:82
char * default_print
Definition copy.h:69
bool(* CopyFromOneRow)(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
Definition copyapi.h:96
copy_data_source_cb data_source_cb
const struct CopyFromRoutine * routine
StringInfoData line_buf
CopyFormatOptions opts
StringInfoData attribute_buf
const char * cur_attval
const char * cur_attname
ErrorSaveContext * escontext
MemoryContext ecxt_per_tuple_memory
Definition execnodes.h:283
Definition nodes.h:135
static FormData_pg_attribute * TupleDescAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:160
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition wchar.c:2224
int pg_encoding_max_length(int encoding)
Definition wchar.c:2235