227 lines
7.1 KiB
Org Mode
227 lines
7.1 KiB
Org Mode
:PROPERTIES:
|
|
:ID: 266d19c2-5ecd-48af-a9d4-4b0a7d3d5696
|
|
:END:
|
|
#+title: 2022-03-29
|
|
* Get list of subscribers with non-normalized tags on active accounts
|
|
:PROPERTIES:
|
|
:header-args:sql: :engine postgresql :cmdline "-U postgres postgres" :dir /docker:postgres: :exports both :cache yes
|
|
:END:
|
|
|
|
Investigating the impact of the new [[id:d06d3ab4-c2d0-47c3-aae1-4395567fc3d2][Tag Normalization]] rules on existing
|
|
subscribers on active accounts.
|
|
|
|
** Gathering data
|
|
I imported a dump of the =subscriber_tags= table from [[id:dd113e53-6144-4cb2-a4aa-da3dc2e3e6ea][AppDB]] as well as the
|
|
=list.subscribers= table data for all active accounts (~SELECT s.* FROM
|
|
list.subscribers s JOIN accounts a ON (a.a_id = s.account_id) WHERE a.status_id
|
|
< 7~).
|
|
|
|
I then built a table of subscribers having tags that do not match our validation
|
|
rules.
|
|
|
|
#+begin_src sql :exports code :eval never
|
|
-- Build one row per (subscriber, tag) pair whose stored tag differs from the
-- value returned by normalize_tag(); list_id and account_id are carried along
-- so later queries can roll the rows up per list or per account.
CREATE TABLE invalid_tags AS
SELECT
    s.list_id,
    s.account_id,
    t.subscriber_id,
    tag
FROM subscribers AS s
INNER JOIN subscriber_tags AS t
    ON s.id = t.subscriber_id
-- unnest() expands tags so that every element becomes its own row.
CROSS JOIN LATERAL unnest(tags) AS tag
WHERE tag <> normalize_tag(tag)
|
|
#+end_src
|
|
|
|
** Active accounts
|
|
#+name: total-accounts
|
|
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
|
|
-- Number of distinct accounts present in the imported subscribers table.
SELECT COUNT(DISTINCT account_id)
FROM subscribers
|
|
#+end_src
|
|
|
|
#+RESULTS[5d46a920e8c34c7fd755c2d1fd82ba567f8341d9]: total-accounts
|
|
| count |
|
|
|---------|
|
|
| 103,357 |
|
|
|
|
** Subscribers on active accounts
|
|
#+name: total-subscribers
|
|
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
|
|
-- Number of rows in the imported subscribers table that have a non-NULL id.
SELECT COUNT(id)
FROM subscribers
|
|
#+end_src
|
|
|
|
#+RESULTS[17e3b468a66f03b74bdddd77044359dd8e4e92e3]: total-subscribers
|
|
| count |
|
|
|-------------|
|
|
| 259,745,858 |
|
|
|
|
** Subscribers with invalid tags
|
|
#+name: subscribers-with-invalid-tags
|
|
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
|
|
-- invalid_tags holds one row per offending tag, so count each subscriber once.
SELECT COUNT(DISTINCT subscriber_id)
FROM invalid_tags
|
|
#+end_src
|
|
|
|
#+RESULTS[b1e6beccd5a3bd4e503473844cb12e6d03d2c21d]: subscribers-with-invalid-tags
|
|
| count |
|
|
|-----------|
|
|
| 1,331,220 |
|
|
|
|
#+RESULTS:
|
|
#+begin_src emacs-lisp
|
|
"259,745,858"
|
|
#+end_src
|
|
|
|
#+header: :var filename="2022-03-29-subscribers-with-invalid-tags.png"
|
|
#+header: :var total=total-subscribers[2,0]
|
|
#+header: :var affected=subscribers-with-invalid-tags[2,0]
|
|
#+begin_src python :exports results :results file
|
|
import matplotlib.pyplot as plt
|
|
|
|
plt.style.use("seaborn-pastel")
|
|
|
|
total = int(str(total).replace(',', ''))
|
|
affected = int(str(affected).replace(',', ''))
|
|
|
|
fig1, ax1 = plt.subplots()
|
|
ax1.pie(
|
|
[affected, total - affected],
|
|
explode=[0.5, 0],
|
|
labels=[f"Affected", "Unaffected"],
|
|
autopct="%1.1f%%",
|
|
)
|
|
plt.savefig(filename)
|
|
return filename
|
|
#+end_src
|
|
|
|
#+RESULTS:
|
|
[[file:2022-03-29-subscribers-with-invalid-tags.png]]
|
|
|
|
** Accounts with subscribers with invalid tags
|
|
#+name: accounts-with-invalid-tags
|
|
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
|
|
-- Number of distinct accounts that own at least one row in invalid_tags.
SELECT COUNT(DISTINCT account_id)
FROM invalid_tags;
|
|
#+end_src
|
|
|
|
#+RESULTS[0ee163f090d0e32e6a45b5b859948882ffe021d0]: accounts-with-invalid-tags
|
|
| count |
|
|
|-------|
|
|
| 3,220 |
|
|
|
|
#+header: :var filename="2022-03-29-accounts-with-invalid-tags.png"
|
|
#+header: :var total=total-accounts[2,0]
|
|
#+header: :var affected=accounts-with-invalid-tags[2,0]
|
|
#+begin_src python :exports results :results file
|
|
import matplotlib.pyplot as plt
|
|
|
|
plt.style.use("seaborn-pastel")
|
|
|
|
total = int(str(total).replace(',', ''))
|
|
affected = int(str(affected).replace(',', ''))
|
|
|
|
fig1, ax1 = plt.subplots()
|
|
ax1.pie(
|
|
[affected, total - affected],
|
|
explode=[0.5, 0],
|
|
labels=[f"Affected", "Unaffected"],
|
|
autopct="%1.1f%%",
|
|
)
|
|
plt.savefig(filename)
|
|
return filename
|
|
#+end_src
|
|
|
|
#+RESULTS:
|
|
[[file:2022-03-29-accounts-with-invalid-tags.png]]
|
|
|
|
** Normalized tag breakdown
|
|
#+name: tag-breakdown
|
|
#+begin_src sql :post humanize-numbers-in-results(results=*this*)
|
|
-- Break the invalid tags down by the kind of problem each one exhibits.
-- A single tag can fall into several categories, so the rows overlap and are
-- not expected to sum to the overall totals.
--
-- invalid_tags has one row per offending tag, and a subscriber can have
-- several, so subscribers are counted with DISTINCT just like accounts.
-- Every branch carries a different "Rule" label, so the branches are disjoint
-- and UNION ALL avoids a pointless de-duplication pass.
SELECT 'Non-printable characters' AS "Rule"
     , COUNT(DISTINCT account_id) AS "Accounts"
     , COUNT(DISTINCT subscriber_id) AS "Subscribers"
  FROM invalid_tags
 WHERE tag ~ '[^[:print:]]'
UNION ALL
SELECT 'Commas' AS "Rule"
     , COUNT(DISTINCT account_id) AS "Accounts"
     , COUNT(DISTINCT subscriber_id) AS "Subscribers"
  FROM invalid_tags
 WHERE tag ~ ','
UNION ALL
SELECT 'ASCII quotation marks' AS "Rule"
     , COUNT(DISTINCT account_id) AS "Accounts"
     , COUNT(DISTINCT subscriber_id) AS "Subscribers"
  FROM invalid_tags
 WHERE tag ~ '[''""]'
UNION ALL
SELECT 'Unicode quotation marks' AS "Rule"
     , COUNT(DISTINCT account_id) AS "Accounts"
     , COUNT(DISTINCT subscriber_id) AS "Subscribers"
  FROM invalid_tags
 WHERE tag ~ '[‘’“”]'
UNION ALL
SELECT 'Leading or trailing whitespace' AS "Rule"
     , COUNT(DISTINCT account_id) AS "Accounts"
     , COUNT(DISTINCT subscriber_id) AS "Subscribers"
  FROM invalid_tags
 WHERE TRIM(tag) != tag
UNION ALL
SELECT 'Repeated whitespace' AS "Rule"
     , COUNT(DISTINCT account_id) AS "Accounts"
     , COUNT(DISTINCT subscriber_id) AS "Subscribers"
  FROM invalid_tags
    -- A POSIX character class is only recognised inside a bracket expression:
    -- a bare [:space:] is itself a bracket expression matching the literal
    -- characters ':', 's', 'p', 'a', 'c' and 'e', hence the doubled brackets.
    -- TRIM() first, so leading/trailing runs are left to the previous rule.
 WHERE TRIM(tag) ~ '[[:space:]]{2,}'
UNION ALL
SELECT 'Upper-case characters' AS "Rule"
     , COUNT(DISTINCT account_id) AS "Accounts"
     , COUNT(DISTINCT subscriber_id) AS "Subscribers"
  FROM invalid_tags
 WHERE LOWER(tag) != tag
|
|
#+end_src
|
|
|
|
#+RESULTS[853c04719cf6fd9c329359c1b72e8198393322b9]: tag-breakdown
|
|
| Rule | Accounts | Subscribers |
|
|
|--------------------------------+----------+-------------|
|
|
| Leading or trailing whitespace | 119 | 66,788 |
|
|
| Repeated whitespace | 2,404 | 1,234,651 |
|
|
| Unicode quotation marks | 126 | 21,343 |
|
|
| Commas | 378 | 54,567 |
|
|
| ASCII quotation marks | 2,507 | 1,544,607 |
|
|
| Upper-case characters | 0 | 0 |
|
|
| Non-printable characters | 58 | 1,749 |
|
|
|
|
#+header: :var filename="2022-03-29-invalid-tag-breakdown.png"
|
|
#+header: :var breakdown=tag-breakdown
|
|
#+begin_src python :exports results :results file
|
|
import matplotlib.pyplot as plt
|
|
|
|
plt.style.use("seaborn-pastel")
|
|
|
|
accounts = sorted(
|
|
[(int(str(row[1]).replace(",", "")), row[0]) for row in breakdown],
|
|
reverse=True,
|
|
)
|
|
account_total = sum(a[0] for a in accounts)
|
|
subscribers = sorted(
|
|
[(int(str(row[2]).replace(",", "")), row[0]) for row in breakdown],
|
|
reverse=True,
|
|
)
|
|
subscriber_total = sum(s[0] for s in subscribers)
|
|
|
|
fig1, axs = plt.subplots(2, 2)
|
|
axs[0, 0].set_title("Accounts")
|
|
wedges1, _ = axs[0, 0].pie(
|
|
[a[0] for a in accounts],
|
|
explode=[0.1 for _ in accounts],
|
|
)
|
|
axs[1, 0].axis("off")
|
|
axs[1, 0].legend(
|
|
wedges1,
|
|
["{:.1f}% {}".format(a[0] * 100.0 / account_total, a[1]) for a in accounts],
|
|
fontsize=8,
|
|
)
|
|
axs[0, 1].set_title("Subscribers")
|
|
wedges2, _ = axs[0, 1].pie(
|
|
[s[0] for s in subscribers],
|
|
explode=[0.1 for _ in subscribers],
|
|
)
|
|
axs[1, 1].axis("off")
|
|
axs[1, 1].legend(
|
|
wedges2,
|
|
["{:.1f}% {}".format(s[0] * 100.0 / subscriber_total, s[1]) for s in subscribers],
|
|
fontsize=8,
|
|
)
|
|
plt.savefig(filename)
|
|
return filename
|
|
#+end_src
|
|
|
|
#+RESULTS:
|
|
[[file:2022-03-29-invalid-tag-breakdown.png]]
|