mirror of
https://github.com/correl/mage.git
synced 2024-11-14 11:09:31 +00:00
python script added to remove duplicates from mtg-cards-data.txt. backed up old version with _old. deduped file is half the size
This commit is contained in:
parent
0ffeb2d5b6
commit
8f2eda579e
3 changed files with 57794 additions and 28094 deletions
43
Utils/de-dup-cards-data.py
Normal file
43
Utils/de-dup-cards-data.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
"""
|
||||
Purpose: Removes duplicate CardName|CardSet|CardNumber| entries from mtg-cards-data.txt that crop up
|
||||
|
||||
@author: escplan9 (Derek Monturo - dmontur1 at gmail dot com)
|
||||
@version: 1.0
|
||||
|
||||
Written in Python 3.x, should work in Python 2.x as well.
|
||||
"""
|
||||
import re
|
||||
|
||||
"""
|
||||
example line from file:
|
||||
Aven Mimeomancer|Alara Reborn|2|R|{1}{W}{U}|Creature - Bird Wizard|3|1|Flying$At the beginning of your upkeep, you may put a feather counter on target creature. If you do, that creature is 3/1 and has flying for as long as it has a feather counter on it.|
|
||||
|
||||
With the reg-ex pattern below, separates into 2 match groups
|
||||
match-group-1: Aven Mimeomancer|Alara Reborn|2|
|
||||
match-group-2: (remainder of the line up to the \r\n)
|
||||
"""
|
||||
reg_ptn = re.compile(r'([a-zA-Z].*[|][a-zA-Z].*[|]\d+[|])(.*[\r\n])')
|
||||
orig_txt_filename = 'mtg-cards-data.txt'
|
||||
|
||||
card_dict = {}
|
||||
new_contents = ''
|
||||
|
||||
with open(orig_txt_filename) as orig_txt_file:
|
||||
for line in orig_txt_file:
|
||||
matchObj = re.match(reg_ptn, line)
|
||||
if matchObj is not None:
|
||||
matchGroups = matchObj.groups()
|
||||
if len(matchGroups) > 0:
|
||||
card_set_num = matchGroups[0]
|
||||
if card_set_num not in card_dict.keys(): # only add unique card-set-number entries to new-contents
|
||||
card_dict[card_set_num] = True
|
||||
new_contents += line
|
||||
else:
|
||||
new_contents += line
|
||||
else:
|
||||
new_contents += line
|
||||
|
||||
# generate new file with the de-duped contents
|
||||
new_txt_filename = 'unduped-cards-data.txt'
|
||||
with open(new_txt_filename, "w") as new_txt_file:
|
||||
new_txt_file.write(new_contents)
|
28094
Utils/mtg-cards-data.txt
28094
Utils/mtg-cards-data.txt
File diff suppressed because it is too large
Load diff
57751
Utils/mtg-cards-data_old.txt
Normal file
57751
Utils/mtg-cards-data_old.txt
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue