ccoop_resplit.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. #! /usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. import os, sys, json
  4. import csv
  5. from collections import OrderedDict
  6. from datetime import datetime
  7. import hashlib
  8. import locale
  9. locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')
  10. class CsvStatementParser(object):
  11. def __init__(self):
  12. self.lines = OrderedDict()
  13. self.fieldnames = None
  14. self.date_fieldname = "Date"
  15. self.overlap_detector = {}
  16. self.first_ops = {}
  17. self.last_ops = {}
  18. self.daterange = [datetime.now(), datetime.fromordinal(1)]
  19. def parse(self, filename):
  20. with open(filename, encoding='iso-8859-1') as csvfile:
  21. reader = csv.DictReader(csvfile, delimiter=';')
  22. if self.fieldnames is None:
  23. # Le premier fichier parcourru détermine les noms de
  24. # colonnes attendus dans les prochains fichiers.
  25. self.fieldnames = [k for k in reader.fieldnames if k != '']
  26. # On identifie également la permière colonne qui
  27. # ressemble à une date, elle servira ensuite de clef
  28. # d'indexation.
  29. for fname in self.fieldnames:
  30. if "date" in fname.lower():
  31. self.date_fieldname = fname
  32. break
  33. if self.fieldnames != [k for k in reader.fieldnames if k != '']:
  34. print("""Fichier ignoré : %s. Cause: does not have the expected column names.
  35. Found: %s
  36. Expected: %s
  37. """ % (filename, ",".join(reader.fieldnames), ",".join(self.fieldnames)))
  38. else:
  39. self._parse_file(filename, reader)
  40. def _parse_file(self, filename, reader):
  41. print("Lecture du fichier %s" % os.path.basename(filename))
  42. for row in reader:
  43. opdate = datetime.strptime(row[self.date_fieldname], '%d/%m/%Y')
  44. ophash = datetime.strftime(opdate, '%Y-%m-%d') + hashlib.md5(json.dumps(row).encode()).hexdigest()
  45. self.lines[ophash] = {k:v for k,v in row.items() if k != ''}
  46. # Adjust dateranges
  47. if opdate < self.daterange[0]:
  48. self.daterange[0] = opdate
  49. if opdate > self.daterange[1]:
  50. self.daterange[1] = opdate
  51. # Prepare overlap detection
  52. if ophash not in self.overlap_detector:
  53. self.overlap_detector[ophash] = set()
  54. self.overlap_detector[ophash].add(filename)
  55. # Remember first line of each CSV file
  56. if filename not in self.first_ops:
  57. self.first_ops[filename] = ophash
  58. # Remember first line of each CSV file
  59. if filename not in self.last_ops:
  60. self.last_ops[filename] = ophash
  61. # CSV files are sometimes sorted by date ASC and sometimes
  62. # sorted by date DESC. So we may need to swap first_op and last_op.
  63. if (int(self.first_ops[filename][0:10].replace('-', '')) > int(self.last_ops[filename][0:10].replace('-', ''))):
  64. tmp = self.first_ops[filename]
  65. self.first_ops[filename] = self.last_ops[filename]
  66. self.last_ops[filename] = tmp
  67. def dump_full(self, output_filename):
  68. with open(output_filename, 'w') as outfile:
  69. writer = csv.DictWriter(outfile, self.fieldnames, delimiter=';')
  70. writer.writeheader()
  71. for line in reversed(sorted(self.lines.items())):
  72. writer.writerow(line[1])
  73. print("Relevé intégral généré dans le fichier %s" % os.path.abspath(output_filename))
  74. def dump_monthly_reports(self, outputdir):
  75. firstmonth = int('{:%Y%m}'.format(self.daterange[0])) + 1
  76. lastmonth = int('{:%Y%m}'.format(self.daterange[1])) - 1
  77. if firstmonth >= lastmonth:
  78. print("Impossible de générer des relevés mensuels car la plage de dates traitée est trop petite.")
  79. return
  80. curmonth = firstmonth
  81. def __openfile__(curmonth):
  82. fname = "releve_{0}.csv".format(curmonth)
  83. outfile = open(os.path.join(outputdir, fname), 'w')
  84. writer = csv.DictWriter(outfile, self.fieldnames, delimiter=';')
  85. writer.writeheader()
  86. return outfile, writer
  87. outfile, writer = __openfile__(curmonth)
  88. writer = csv.DictWriter(outfile, self.fieldnames, delimiter=';')
  89. for line in sorted(self.lines.items()):
  90. month = int(line[0][0:4] + line[0][5:7])
  91. if month < curmonth:
  92. continue
  93. if month > lastmonth:
  94. break
  95. if month > curmonth:
  96. outfile.close()
  97. curmonth = month
  98. if month in self.badmonths:
  99. outfile, writer = __openfile__(str(curmonth) + "_potentiellement_incomplet")
  100. else:
  101. outfile, writer = __openfile__(curmonth)
  102. writer.writerow(line[1])
  103. outfile.close()
  104. print("Relevés mensuels générés dans le dossier %s" % os.path.abspath(outputdir))
  105. def check_overlaps(self):
  106. """
  107. Helps finding possible missing operations if exported CSV files
  108. are not "contiguous".
  109. """
  110. self.badmonths = set()
  111. print("\nRecherche de chevauchements (les chevauchements de fichiers CSV c'est bien, ça rassure)...")
  112. for filename, first_op in self.first_ops.items():
  113. if first_op in self.overlap_detector:
  114. otherfiles = [v for v in self.overlap_detector.get(first_op)]
  115. otherfiles.remove(filename)
  116. if len(otherfiles) > 0:
  117. # Eliminate files having the same first_op
  118. otherfiles[:] = [candidate for candidate in otherfiles if self.first_ops[candidate] != first_op]
  119. if len(otherfiles) == 0 and first_op[0:10] != "{0:%Y-%m-%d}".format(self.daterange[0]):
  120. self.badmonths.add(int(first_op[0:7].replace('-', '')))
  121. print("Attention. Il y a peut-être des écritures manquantes avant le %s (fichier %s)." % (first_op[0:10], os.path.basename(filename)))
  122. for filename, last_op in self.last_ops.items():
  123. if last_op in self.overlap_detector:
  124. otherfiles = [v for v in self.overlap_detector.get(last_op)]
  125. otherfiles.remove(filename)
  126. if len(otherfiles) > 0:
  127. # Eliminate files having the same last_op
  128. otherfiles[:] = [candidate for candidate in otherfiles if self.last_ops[candidate] != last_op]
  129. if len(otherfiles) == 0:
  130. self.badmonths.add(int(last_op[0:7].replace('-', '')))
  131. if last_op[0:10] != "{0:%Y-%m-%d}".format(self.daterange[1]):
  132. print("Attention. Il y a peut-être des écritures manquantes après le %s (fichier %s)." % (last_op[0:10], os.path.basename(filename)))
  133. print("")
  134. def start_cli(dirpath, outputdir):
  135. # Lecture des fichiers CSV présents dans le dossier
  136. p = CsvStatementParser()
  137. for f in sorted(os.listdir(dirpath)):
  138. if f.endswith('.csv') or f.endswith('.CSV'):
  139. p.parse(os.path.join(dirpath, f))
  140. print("Les écritures lues s'étalent entre le {0:%d %B %Y} et le {1:%d %B %Y}.".format(p.daterange[0], p.daterange[1]))
  141. # Recherche de chevauchements
  142. p.check_overlaps()
  143. # Générer un relevé intégral et des relevés mensuels
  144. suffix = "_{0:%Y-%m-%d}__{1:%Y-%m-%d}".format(p.daterange[0], p.daterange[1])
  145. if len(p.badmonths): suffix += "_avec_des_trous"
  146. p.dump_full(os.path.join(outputdir, "integral%s.csv" % suffix))
  147. p.dump_monthly_reports(outputdir)
  148. if __name__ == '__main__':
  149. if len(sys.argv) < 2:
  150. print("Erreur. Merci de préciser le chemin du dossier où se trouvent les fichiers CSV à analyser.")
  151. sys.exit(1)
  152. inputdir = sys.argv[1]
  153. if len(sys.argv) > 2:
  154. outputdir = sys.argv[2]
  155. else:
  156. outputdir = os.path.join(inputdir, "outputdir")
  157. # Création d'un dossier output si besoin
  158. if not os.path.isdir(outputdir):
  159. os.makedirs(outputdir)
  160. start_cli(inputdir, outputdir)