Source code for parce.lang.csv

# -*- coding: utf-8 -*-
#
# This file is part of the parce Python package.
#
# Copyright © 2021-2021 by Wilbert Berendsen <info@wilbertberendsen.nl>
#
# This module is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This module is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
RFC-4180 compliant CSV format
"""

__all__ = ('Csv', 'CsvTransform')

import re

from parce import Language, lexicon, skip, default_action, default_target
from parce.rule import bygroup
from parce.transform import Transform
from parce.util import split_list
import parce.action as a


[docs]class Csv(Language): """RFC-4180 compliant CSV format.""" @lexicon def root(cls): """Split a file in records.""" yield default_target, cls.record @lexicon(re_flags=re.MULTILINE) def record(cls): """Split a record in escaped (string) and non-escaped fields.""" yield r'$\n?', skip, -1 yield r'[^,"\n]+(?=$|,|\n)', a.Name yield r'[ \t]*((?:[^,"\s]+[ \t]*)+)?(")', bygroup(a.Invalid, a.String.Start), cls.string yield ',', a.Separator @lexicon(consume=True) def string(cls): """Handle a quoted string, escaping doubled quotes inside.""" yield r'""', a.String.Escape yield r'(")[ \t]*([^,"\s]+)?', bygroup(a.String.End, a.Invalid), -1 yield default_action, a.String
[docs]class CsvTransform(Transform): r"""Transform for comma-separated values, that creates a list of tuples. For example:: >>> import parce.transform >>> parce.transform.transform_text(parce.find('csv'), 'a,b,,c\nd,"",e,"x,y,z"') [('a', 'b', None, 'c'), ('d', '', 'e', 'x,y,z')] """ def _interpret(self, token): """Reimplement to interpret a text value differently, e.g. a number.""" return token.text
[docs] def root(self, items): """Return the list of records.""" return [i.obj for i in items]
[docs] def record(self, items): """Return the tuple of the fields of one record. Adjacent commas yield None, but empty quoted strings (``""``) are returned as empty strings. """ return tuple( None if not l else self._interpret(l[0]) if l[0].is_token else l[0].obj for l in split_list(items, ','))
[docs] def string(self, items): """Return a string comprising the contents of the quoted string. Handles doubled quotes inside, and does not add the outer quotes. """ start, end = 0, len(items) - 1 while items[start].action in (a.Invalid, a.String.Start): start += 1 while end >= start and items[end].action in (a.String.End, a.Invalid): end -= 1 return ''.join( '"' if t.action is a.String.Escape else t.text for t in items[start:end+1])