:PROPERTIES:
:ID:       266d19c2-5ecd-48af-a9d4-4b0a7d3d5696
:END:
#+title: 2022-03-29

* Get list of subscribers with non-normalized tags on active accounts
:PROPERTIES:
:header-args:sql: :engine postgresql :cmdline "-U postgres postgres" :dir /docker:postgres: :exports both :cache yes
:END:

Investigating the impact of the new [[id:d06d3ab4-c2d0-47c3-aae1-4395567fc3d2][Tag Normalization]] rules on existing subscribers on active accounts.

** Gathering data

I imported a dump of the =subscriber_tags= table from [[id:dd113e53-6144-4cb2-a4aa-da3dc2e3e6ea][AppDB]] as well as the =list.subscribers= table data for all active accounts (~SELECT s.* FROM list.subscribers s JOIN accounts a ON (a.a_id = s.account_id) WHERE a.status_id < 7~).

I then built a table of subscribers having tags that do not match our validation rules.

#+begin_src sql :exports code :eval never
  -- One row per (subscriber, tag) pair whose tag differs from the output of
  -- normalize_tag().  unnest(tags) expands the tag array into one row per tag.
  -- NOTE(review): if normalize_tag() can return NULL, "!=" evaluates to NULL
  -- and such tags are silently left out of this table -- confirm.
  CREATE TABLE invalid_tags AS
  SELECT s.list_id
       , s.account_id
       , t.subscriber_id
       , tag
    FROM subscribers s
    JOIN subscriber_tags as t ON (s.id = t.subscriber_id)
       , unnest(tags) tag
   WHERE tag != normalize_tag(tag)
#+end_src

** Active accounts

#+name: total-accounts
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
  SELECT COUNT(DISTINCT account_id) FROM subscribers
#+end_src

#+RESULTS[5d46a920e8c34c7fd755c2d1fd82ba567f8341d9]: total-accounts
| count   |
|---------|
| 103,357 |

** Subscribers on active accounts

#+name: total-subscribers
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
  SELECT COUNT(id) FROM subscribers
#+end_src

#+RESULTS[17e3b468a66f03b74bdddd77044359dd8e4e92e3]: total-subscribers
| count       |
|-------------|
| 259,745,858 |

** Subscribers with invalid tags

#+name: subscribers-with-invalid-tags
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
  SELECT COUNT(DISTINCT subscriber_id) FROM invalid_tags
#+end_src

#+RESULTS[b1e6beccd5a3bd4e503473844cb12e6d03d2c21d]: subscribers-with-invalid-tags
| count     |
|-----------|
| 1,331,220 |

#+RESULTS:
#+begin_src emacs-lisp
"259,745,858"
#+end_src

#+header: :var filename="2022-03-29-subscribers-with-invalid-tags.png"
#+header: :var total=total-subscribers[2,0]
#+header: :var affected=subscribers-with-invalid-tags[2,0]
#+begin_src python :exports results :results file
  # Pie chart: share of subscribers that have at least one invalid tag.
  import matplotlib.pyplot as plt

  # "seaborn-pastel" was renamed to "seaborn-v0_8-pastel" in matplotlib 3.6
  # and the old name was later removed; use whichever name is available.
  plt.style.use(
      "seaborn-v0_8-pastel"
      if "seaborn-v0_8-pastel" in plt.style.available
      else "seaborn-pastel"
  )

  # The counts arrive humanized (e.g. "259,745,858"); strip the thousands
  # separators before converting.
  total = int(str(total).replace(",", ""))
  affected = int(str(affected).replace(",", ""))

  fig1, ax1 = plt.subplots()
  ax1.pie(
      [affected, total - affected],
      explode=[0.5, 0],
      labels=["Affected", "Unaffected"],
      autopct="%1.1f%%",
  )
  plt.savefig(filename)
  return filename
#+end_src

#+RESULTS:
[[file:2022-03-29-subscribers-with-invalid-tags.png]]

** Accounts with subscribers with invalid tags

#+name: accounts-with-invalid-tags
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
  SELECT COUNT(DISTINCT account_id) FROM invalid_tags;
#+end_src

#+RESULTS[0ee163f090d0e32e6a45b5b859948882ffe021d0]: accounts-with-invalid-tags
| count |
|-------|
| 3,220 |

#+header: :var filename="2022-03-29-accounts-with-invalid-tags.png"
#+header: :var total=total-accounts[2,0]
#+header: :var affected=accounts-with-invalid-tags[2,0]
#+begin_src python :exports results :results file
  # Pie chart: share of active accounts that have at least one subscriber
  # with an invalid tag.
  import matplotlib.pyplot as plt

  # "seaborn-pastel" was renamed to "seaborn-v0_8-pastel" in matplotlib 3.6
  # and the old name was later removed; use whichever name is available.
  plt.style.use(
      "seaborn-v0_8-pastel"
      if "seaborn-v0_8-pastel" in plt.style.available
      else "seaborn-pastel"
  )

  # The counts arrive humanized (e.g. "103,357"); strip the thousands
  # separators before converting.
  total = int(str(total).replace(",", ""))
  affected = int(str(affected).replace(",", ""))

  fig1, ax1 = plt.subplots()
  ax1.pie(
      [affected, total - affected],
      explode=[0.5, 0],
      labels=["Affected", "Unaffected"],
      autopct="%1.1f%%",
  )
  plt.savefig(filename)
  return filename
#+end_src

#+RESULTS:
[[file:2022-03-29-accounts-with-invalid-tags.png]]

** Normalized tag breakdown

*Note:* the query below was corrected after the cached results were captured. The repeated-whitespace rule previously used a malformed character class (it matched the literal characters of the word "space" rather than whitespace), and the Subscribers column counted tag rows rather than distinct subscribers. The table and chart below predate the fix; re-evaluate the block to regenerate them.

#+name: tag-breakdown
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
  -- For each normalization rule, count the distinct accounts and distinct
  -- subscribers that have at least one tag violating it.  A single tag can
  -- violate several rules, so the rows overlap and do not add up to the
  -- overall totals.
  SELECT 'Non-printable characters' AS "Rule"
       , COUNT(DISTINCT account_id) AS "Accounts"
       , COUNT(DISTINCT subscriber_id) AS "Subscribers"
    FROM invalid_tags
   WHERE tag ~ '[^[:print:]]'
  UNION ALL
  SELECT 'Commas' AS "Rule"
       , COUNT(DISTINCT account_id) AS "Accounts"
       , COUNT(DISTINCT subscriber_id) AS "Subscribers"
    FROM invalid_tags
   WHERE tag ~ ','
  UNION ALL
  SELECT 'ASCII quotation marks' AS "Rule"
       , COUNT(DISTINCT account_id) AS "Accounts"
       , COUNT(DISTINCT subscriber_id) AS "Subscribers"
    FROM invalid_tags
   WHERE tag ~ '[''"]'
  UNION ALL
  SELECT 'Unicode quotation marks' AS "Rule"
       , COUNT(DISTINCT account_id) AS "Accounts"
       , COUNT(DISTINCT subscriber_id) AS "Subscribers"
    FROM invalid_tags
   WHERE tag ~ '[‘’“”]'
  UNION ALL
  SELECT 'Leading or trailing whitespace' AS "Rule"
       , COUNT(DISTINCT account_id) AS "Accounts"
       , COUNT(DISTINCT subscriber_id) AS "Subscribers"
    FROM invalid_tags
   WHERE TRIM(tag) != tag
  UNION ALL
  -- TRIM() first so that leading/trailing runs are only reported by the rule
  -- above.  The class must be written [[:space:]]: a bare [:space:] is a
  -- bracket expression matching the literal characters ':', 's', 'p', 'a',
  -- 'c' and 'e'.
  SELECT 'Repeated whitespace' AS "Rule"
       , COUNT(DISTINCT account_id) AS "Accounts"
       , COUNT(DISTINCT subscriber_id) AS "Subscribers"
    FROM invalid_tags
   WHERE TRIM(tag) ~ '[[:space:]]{2,}'
  UNION ALL
  SELECT 'Upper-case characters' AS "Rule"
       , COUNT(DISTINCT account_id) AS "Accounts"
       , COUNT(DISTINCT subscriber_id) AS "Subscribers"
    FROM invalid_tags
   WHERE LOWER(tag) != tag
   ORDER BY "Subscribers" DESC, "Rule"
#+end_src

#+RESULTS[853c04719cf6fd9c329359c1b72e8198393322b9]: tag-breakdown
| Rule                           | Accounts | Subscribers |
|--------------------------------+----------+-------------|
| Leading or trailing whitespace | 119      | 66,788      |
| Repeated whitespace            | 2,404    | 1,234,651   |
| Unicode quotation marks        | 126      | 21,343      |
| Commas                         | 378      | 54,567      |
| ASCII quotation marks          | 2,507    | 1,544,607   |
| Upper-case characters          | 0        | 0           |
| Non-printable characters       | 58       | 1,749       |

#+header: :var filename="2022-03-29-invalid-tag-breakdown.png"
#+header: :var breakdown=tag-breakdown
#+begin_src python :exports results :results file
  # Two pie charts (accounts, subscribers) showing each rule's share of the
  # violations, with a percentage legend drawn underneath each chart.
  import matplotlib.pyplot as plt

  # "seaborn-pastel" was renamed to "seaborn-v0_8-pastel" in matplotlib 3.6
  # and the old name was later removed; use whichever name is available.
  plt.style.use(
      "seaborn-v0_8-pastel"
      if "seaborn-v0_8-pastel" in plt.style.available
      else "seaborn-pastel"
  )

  fig1, axs = plt.subplots(2, 2)

  # Column 1 of the breakdown table holds account counts, column 2 holds
  # subscriber counts; column 0 is the rule name.
  for position, title in enumerate(("Accounts", "Subscribers")):
      # (count, rule) pairs, largest first.  Counts arrive humanized
      # ("1,234"), so strip the thousands separators before converting.
      ranked = sorted(
          [
              (int(str(row[position + 1]).replace(",", "")), row[0])
              for row in breakdown
          ],
          reverse=True,
      )
      # Percentages are relative to the sum over all rules, not to the number
      # of affected accounts/subscribers.
      grand_total = sum(count for count, _ in ranked)

      axs[0, position].set_title(title)
      wedges, _ = axs[0, position].pie(
          [count for count, _ in ranked],
          explode=[0.1 for _ in ranked],
      )

      # The lower row of axes only hosts the legend for the chart above it.
      axs[1, position].axis("off")
      axs[1, position].legend(
          wedges,
          [
              "{:.1f}% {}".format(count * 100.0 / grand_total, rule)
              for count, rule in ranked
          ],
          fontsize=8,
      )

  plt.savefig(filename)
  return filename
#+end_src

#+RESULTS:
[[file:2022-03-29-invalid-tag-breakdown.png]]