From 929d982e38a2775e66543e04b57055889f0efb20 Mon Sep 17 00:00:00 2001 From: North Date: Sat, 23 Jul 2011 19:18:37 +0300 Subject: [PATCH] Added the GathererCrawler project (used for generating mtg-cards-data.txt) --- Utils/GathererCrawler/pom.xml | 25 +++ .../main/java/north/gatherercrawler/Card.java | 197 ++++++++++++++++++ .../north/gatherercrawler/CardParser.java | 135 ++++++++++++ .../java/north/gatherercrawler/CardsList.java | 26 +++ .../main/java/north/gatherercrawler/Main.java | 193 +++++++++++++++++ .../north/gatherercrawler/ParseQueue.java | 29 +++ .../north/gatherercrawler/ParsedList.java | 25 +++ .../north/gatherercrawler/ThreadStarter.java | 120 +++++++++++ 8 files changed, 750 insertions(+) create mode 100644 Utils/GathererCrawler/pom.xml create mode 100644 Utils/GathererCrawler/src/main/java/north/gatherercrawler/Card.java create mode 100644 Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardParser.java create mode 100644 Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardsList.java create mode 100644 Utils/GathererCrawler/src/main/java/north/gatherercrawler/Main.java create mode 100644 Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParseQueue.java create mode 100644 Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParsedList.java create mode 100644 Utils/GathererCrawler/src/main/java/north/gatherercrawler/ThreadStarter.java diff --git a/Utils/GathererCrawler/pom.xml b/Utils/GathererCrawler/pom.xml new file mode 100644 index 0000000000..388034ca80 --- /dev/null +++ b/Utils/GathererCrawler/pom.xml @@ -0,0 +1,25 @@ + + + 4.0.0 + + ro.trp + GathererCrawler + 1.0 + jar + + GathererCrawler + http://maven.apache.org + + + UTF-8 + + + + + org.jsoup + jsoup + 1.5.2 + + + diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Card.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Card.java new file mode 100644 index 0000000000..b486a24a63 --- /dev/null +++ 
b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Card.java
@@ -0,0 +1,231 @@
+package north.gatherercrawler;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * One printing of a card as scraped from a Gatherer details page.
+ * <p>
+ * A card is persisted as a single line by {@link #toString()} and restored by
+ * {@link #Card(String)}: fields are separated by {@code |} and the entries of
+ * the list fields (mana cost, card text) by {@code $}. Neither separator is
+ * escaped, so field values must not contain them.
+ * <p>
+ * The natural ordering is by multiverse id.
+ *
+ * @author North
+ */
+public class Card implements Comparable<Card> {
+
+    /** Number of fields in a serialized card line. */
+    private static final int FIELD_COUNT = 12;
+
+    private Integer multiverseId;
+    private String name;
+    // The lists start out empty rather than null so that toString() also works
+    // on a card whose details have not been filled in yet.
+    private List<String> manaCost = new ArrayList<String>();
+    private Integer convertedManaCost;
+    private String types;
+    private List<String> cardText = new ArrayList<String>();
+    private String flavorText;
+    private String powerToughness;
+    private String expansion;
+    private String rarity;
+    private String cardNumber;
+    private String artist;
+
+    /**
+     * Creates an otherwise empty card for the given id.
+     *
+     * @param multiverseId the multiverse id of the printing
+     */
+    public Card(Integer multiverseId) {
+        this.multiverseId = multiverseId;
+    }
+
+    /**
+     * Restores a card from a line produced by {@link #toString()}.
+     *
+     * @param card the serialized card
+     * @throws IllegalArgumentException if the line has fewer than 12 fields
+     * @throws NumberFormatException if the id or the converted mana cost is
+     *         present but not an integer
+     */
+    public Card(String card) {
+        // The positive limit keeps trailing empty fields (a plain split would
+        // drop them); anything after the last field lands in an extra slot
+        // that is ignored.
+        String[] split = card.split("\\|", FIELD_COUNT + 1);
+        if (split.length < FIELD_COUNT) {
+            throw new IllegalArgumentException("Expected " + FIELD_COUNT
+                    + " fields but found " + split.length + ": " + card);
+        }
+        if (split[0].length() > 0) {
+            multiverseId = Integer.parseInt(split[0]);
+        }
+        name = emptyToNull(split[1]);
+        manaCost = new ArrayList<String>(Arrays.asList(split[2].split("\\$")));
+        if (split[3].length() > 0) {
+            convertedManaCost = Integer.parseInt(split[3]);
+        }
+        types = emptyToNull(split[4]);
+        cardText = new ArrayList<String>(Arrays.asList(split[5].split("\\$")));
+        flavorText = emptyToNull(split[6]);
+        powerToughness = emptyToNull(split[7]);
+        expansion = emptyToNull(split[8]);
+        rarity = emptyToNull(split[9]);
+        cardNumber = emptyToNull(split[10]);
+        artist = emptyToNull(split[11]);
+    }
+
+    /** Maps the empty string used for unset fields back to {@code null}. */
+    private static String emptyToNull(String value) {
+        return value.length() > 0 ? value : null;
+    }
+
+    /** Maps {@code null} to the empty string used for unset fields. */
+    private static Object nullToEmpty(Object value) {
+        return value != null ? value : "";
+    }
+
+    /** Appends the entries separated by {@code $}; a null list adds nothing. */
+    private static void appendList(StringBuilder sb, List<String> values) {
+        if (values == null) {
+            return;
+        }
+        for (int i = 0; i < values.size(); i++) {
+            if (i > 0) {
+                sb.append('$');
+            }
+            sb.append(values.get(i));
+        }
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public void setArtist(String artist) {
+        this.artist = artist;
+    }
+
+    public String getCardNumber() {
+        return cardNumber;
+    }
+
+    public void setCardNumber(String cardNumber) {
+        this.cardNumber = cardNumber;
+    }
+
+    public List<String> getCardText() {
+        return cardText;
+    }
+
+    public void setCardText(List<String> cardText) {
+        this.cardText = cardText;
+    }
+
+    public Integer getConvertedManaCost() {
+        return convertedManaCost;
+    }
+
+    public void setConvertedManaCost(Integer convertedManaCost) {
+        this.convertedManaCost = convertedManaCost;
+    }
+
+    public String getExpansion() {
+        return expansion;
+    }
+
+    public void setExpansion(String expansion) {
+        this.expansion = expansion;
+    }
+
+    public String getFlavorText() {
+        return flavorText;
+    }
+
+    public void setFlavorText(String flavorText) {
+        this.flavorText = flavorText;
+    }
+
+    public List<String> getManaCost() {
+        return manaCost;
+    }
+
+    public void setManaCost(List<String> manaCost) {
+        this.manaCost = manaCost;
+    }
+
+    public Integer getMultiverseId() {
+        return multiverseId;
+    }
+
+    public void setMultiverseId(Integer multiverseId) {
+        this.multiverseId = multiverseId;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getPowerToughness() {
+        return powerToughness;
+    }
+
+    public void setPowerToughness(String powerToughness) {
+        this.powerToughness = powerToughness;
+    }
+
+    public String getRarity() {
+        return rarity;
+    }
+
+    public void setRarity(String rarity) {
+        this.rarity = rarity;
+    }
+
+    public String getTypes() {
+        return types;
+    }
+
+    public void setTypes(String types) {
+        this.types = types;
+    }
+
+    /**
+     * Serializes the card to the one-line format read by {@link #Card(String)}.
+     * Unset fields are written as empty strings.
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append(nullToEmpty(multiverseId)).append('|');
+        sb.append(nullToEmpty(name)).append('|');
+        appendList(sb, manaCost);
+        sb.append('|');
+        sb.append(nullToEmpty(convertedManaCost)).append('|');
+        sb.append(nullToEmpty(types)).append('|');
+        appendList(sb, cardText);
+        sb.append('|');
+        sb.append(nullToEmpty(flavorText)).append('|');
+        sb.append(nullToEmpty(powerToughness)).append('|');
+        sb.append(nullToEmpty(expansion)).append('|');
+        sb.append(nullToEmpty(rarity)).append('|');
+        sb.append(nullToEmpty(cardNumber)).append('|');
+        sb.append(nullToEmpty(artist));
+        return sb.toString();
+    }
+
+    /** Orders cards by multiverse id. */
+    @Override
+    public int compareTo(Card o) {
+        return this.multiverseId.compareTo(o.getMultiverseId());
+    }
+}
diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardParser.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardParser.java
new file mode 100644
index 0000000000..b93b2171aa
--- /dev/null
+++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardParser.java
@@ -0,0 +1,158 @@
+package north.gatherercrawler;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.NoSuchElementException;
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+/**
+ * Worker thread that takes multiverse ids from {@link ParseQueue}, downloads
+ * the matching Gatherer details page and stores the parsed {@link Card} in
+ * {@link CardsList}.
+ *
+ * @author robert.biter
+ */
+public class CardParser extends Thread {
+
+    /** Prefix shared by the element ids of the detail rows that are read. */
+    private static final String ROW_PREFIX = "#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_";
+    /** Number of download attempts made before a card is given up on. */
+    private static final int MAX_ATTEMPTS = 30;
+    /** Timeout handed to jsoup for one download attempt, in milliseconds. */
+    private static final int TIMEOUT_MILLIS = 20000;
+
+    /**
+     * Downloads a page, trying again after an I/O error.
+     *
+     * @param url the address of the page
+     * @return the parsed page, or {@code null} if every attempt failed
+     */
+    private Document fetch(String url) {
+        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
+            try {
+                Connection connection = Jsoup.connect(url);
+                connection.timeout(TIMEOUT_MILLIS);
+                return connection.get();
+            } catch (IOException ex) {
+                System.out.println("Attempt " + attempt + " of " + MAX_ATTEMPTS
+                        + " failed for " + url + ": " + ex.getMessage());
+            }
+        }
+        return null;
+    }
+
+    /** Returns the trimmed text of the first match, or {@code null} if there is none. */
+    private static String firstText(Document doc, String selector) {
+        Elements select = doc.select(selector);
+        return select.isEmpty() ? null : select.get(0).text().trim();
+    }
+
+    /**
+     * Downloads and parses one card, stores it in {@link CardsList} and queues
+     * the other printings linked from its page that are not yet marked as parsed.
+     *
+     * @param multiverseId the id of the card to fetch
+     * @return {@code true} if the card was stored, {@code false} if the page
+     *         could not be downloaded or parsed
+     */
+    private boolean parseCard(Integer multiverseId) {
+        String url = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=" + multiverseId;
+        Document doc = fetch(url);
+        if (doc == null) {
+            System.out.println("Card get exception: " + multiverseId);
+            return false;
+        }
+
+        Card card = new Card(multiverseId);
+        try {
+            card.setName(firstText(doc, ROW_PREFIX + "nameRow .value"));
+
+            List<String> manaCost = new ArrayList<String>();
+            for (Element element : doc.select(ROW_PREFIX + "manaRow .value img")) {
+                manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
+            }
+            card.setManaCost(manaCost);
+
+            String cmc = firstText(doc, ROW_PREFIX + "cmcRow .value");
+            if (cmc != null) {
+                card.setConvertedManaCost(Integer.parseInt(cmc));
+            }
+
+            card.setTypes(firstText(doc, ROW_PREFIX + "typeRow .value"));
+
+            List<String> cardText = new ArrayList<String>();
+            for (Element element : doc.select(ROW_PREFIX + "textRow .value .cardtextbox")) {
+                // NOTE(review): String.replace matches its argument literally, so
+                // the regex-looking first argument only removes that exact text;
+                // confirm whether replaceAll was intended.
+                cardText.add(element.html().trim().replace("\"[\\d\\w\\s]+?\"", "").replace("\n", ""));
+            }
+            card.setCardText(cardText);
+
+            card.setFlavorText(firstText(doc, ROW_PREFIX + "FlavorText .cardtextbox i"));
+            card.setPowerToughness(firstText(doc, ROW_PREFIX + "ptRow .value"));
+
+            Elements setLinks = doc.select(ROW_PREFIX + "currentSetSymbol a");
+            if (!setLinks.isEmpty()) {
+                // The expansion is read from the second link; a page with only one
+                // link makes get(1) throw and is reported as a parse failure below.
+                card.setExpansion(setLinks.get(1).text().trim());
+            }
+
+            card.setRarity(firstText(doc, ROW_PREFIX + "rarityRow .value span"));
+
+            List<Integer> otherSets = new ArrayList<Integer>();
+            for (Element element : doc.select(ROW_PREFIX + "otherSetsValue a")) {
+                otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", "")));
+            }
+            for (Integer otherSet : otherSets) {
+                if (!ParsedList.contains(otherSet)) {
+                    ParseQueue.add(otherSet);
+                }
+            }
+
+            card.setCardNumber(firstText(doc, ROW_PREFIX + "numberRow .value"));
+            card.setArtist(firstText(doc, ROW_PREFIX + "ArtistCredit a"));
+        } catch (Exception e) {
+            // An unexpected page layout must not stop the crawl: report the
+            // card and skip it.
+            System.out.println("Card parse exception: " + multiverseId + " (" + e + ")");
+            return false;
+        }
+
+        CardsList.add(card);
+        return true;
+    }
+
+    /**
+     * Processes queued ids until the queue is empty, then reports completion
+     * to {@link ThreadStarter}.
+     */
+    @Override
+    public void run() {
+        try {
+            while (!ParseQueue.isEmpty()) {
+                Integer multiverseId;
+                try {
+                    multiverseId = ParseQueue.remove();
+                } catch (NoSuchElementException ex) {
+                    // Another worker took the last id between isEmpty() and remove().
+                    break;
+                }
+                if (!ParsedList.contains(multiverseId)) {
+                    ParsedList.add(multiverseId);
+                    parseCard(multiverseId);
+                }
+            }
+        } finally {
+            // Report completion even when this worker dies from an unexpected
+            // error, so that ThreadStarter is not left waiting for it.
+            ThreadStarter.threadDone();
+        }
+    }
+}
diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardsList.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardsList.java
new file mode 100644
index 0000000000..41bf308e03
--- /dev/null
+++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardsList.java
@@ -0,0 +1,30 @@
+package north.gatherercrawler;
+
+import java.util.Iterator;
+import java.util.concurrent.ConcurrentSkipListSet;
+
+/**
+ * Process-wide set of the cards collected so far, kept in the natural order of
+ * {@link Card} and backed by a concurrent set so workers can add in parallel.
+ *
+ * @author North
+ */
+public class CardsList {
+
+    private static final CardsList instance = new CardsList();
+    private final ConcurrentSkipListSet<Card> list;
+
+    public CardsList() {
+        list = new ConcurrentSkipListSet<Card>();
+    }
+
+    /** Adds a card; a card that compares equal to a stored one is ignored. */
+    public static void add(Card element) {
+        instance.list.add(element);
+    }
+
+    /** Returns an iterator over the stored cards in ascending order. */
+    public static Iterator<Card> iterator() {
+        return instance.list.iterator();
+    }
+}
diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Main.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Main.java
new file mode 100644
index 0000000000..32c7fc3e14
--- /dev/null
+++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Main.java
@@ -0,0 +1,193 @@
+package north.gatherercrawler;
+
+import
java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+/**
+ * Entry point: queues the cards of the selected sets and starts the crawl.
+ * @author robert.biter
+ */
+public class Main {
+
+    /** Loads the cards saved by an earlier run and marks their ids as parsed. */
+    private static void readCardsFromFile() {
+        try {
+            BufferedReader br = new BufferedReader(new InputStreamReader(new DataInputStream(new FileInputStream("cards-data.txt"))));
+            try {
+                String strLine;
+                while ((strLine = br.readLine()) != null) {
+                    if (strLine.length() > 0) {
+                        Card card = new Card(strLine);
+                        CardsList.add(card);
+                        ParsedList.add(card.getMultiverseId());
+                    }
+                }
+            } finally {
+                br.close(); // closes the wrapped file stream too, also when a line fails to parse
+            }
+        } catch (Exception e) { // best effort: the problem is reported and the crawl goes on
+            System.err.println("Error: " + e.getMessage());
+        }
+    }
+
+    /** Queues the not yet parsed cards of one checklist search; returns false if every download attempt failed. */
+    private static boolean queueCards(String setsQuery) {
+        String url = "http://gatherer.wizards.com/Pages/Search/Default.aspx?action=advanced&output=checklist&set=" + setsQuery;
+        for (int retries = 30; retries > 0; retries--) {
+            try {
+                Connection connection = Jsoup.connect(url);
+                connection.timeout(300000);
+                Document doc = connection.get();
+                System.out.println(url);
+                Elements select = doc.select(".checklist .name a");
+                for (Element element : select) {
+                    Integer multiverseId = Integer.parseInt(element.attr("href").replace("../Card/Details.aspx?multiverseid=", ""));
+                    if (!ParsedList.contains(multiverseId)) {
+                        ParseQueue.add(multiverseId);
+                    }
+                }
+                return true;
+            } catch (IOException ex) {
+                System.out.println("Search failed (" + ex.getMessage() + "), attempts left: " + (retries - 1));
+            }
+        }
+        return false;
+    }
+
+    /** Queues the cards of the enabled sets, 20 sets per search, then starts {@link ThreadStarter}. */
+    public static void main(String[] args) throws IOException, InterruptedException {
+        List<String> sets = new ArrayList<String>();
+//        sets.add("Alara Reborn");
+//        sets.add("Alliances");
+//        sets.add("Antiquities");
+//        sets.add("Apocalypse");
+//        sets.add("Arabian Nights");
+//        sets.add("Archenemy");
+//        sets.add("Battle Royale Box Set");
+//        sets.add("Beatdown Box Set");
+//        sets.add("Betrayers of Kamigawa");
+//        sets.add("Champions of Kamigawa");
+//        sets.add("Chronicles");
+//        sets.add("Classic Sixth Edition");
+//        sets.add("Coldsnap");
+//        sets.add("Conflux");
+//        sets.add("Darksteel");
+//        sets.add("Dissension");
+//        sets.add("Duel Decks: Divine vs. Demonic");
+//        sets.add("Duel Decks: Elspeth vs. Tezzeret");
+//        sets.add("Duel Decks: Elves vs. Goblins");
+//        sets.add("Duel Decks: Garruk vs. Liliana");
+//        sets.add("Duel Decks: Jace vs. Chandra");
+//        sets.add("Duel Decks: Knights vs. Dragons");
+//        sets.add("Duel Decks: Phyrexia vs. the Coalition");
+//        sets.add("Eighth Edition");
+//        sets.add("Eventide");
+//        sets.add("Exodus");
+//        sets.add("Fallen Empires");
+//        sets.add("Fifth Dawn");
+//        sets.add("Fifth Edition");
+//        sets.add("Fourth Edition");
+//        sets.add("From the Vault: Dragons");
+//        sets.add("From the Vault: Exiled");
+//        sets.add("From the Vault: Relics");
+//        sets.add("Future Sight");
+//        sets.add("Guildpact");
+//        sets.add("Homelands");
+//        sets.add("Ice Age");
+//        sets.add("Invasion");
+//        sets.add("Judgment");
+//        sets.add("Legends");
+//        sets.add("Legions");
+//        sets.add("Limited Edition Alpha");
+//        sets.add("Limited Edition Beta");
+//        sets.add("Lorwyn");
+//        sets.add("Magic 2010");
+//        sets.add("Magic 2011");
+//        sets.add("Magic 2012");
+//        sets.add("Masters Edition");
+//        sets.add("Masters Edition II");
+//        sets.add("Masters Edition III");
+//        sets.add("Masters Edition IV");
+//        sets.add("Mercadian Masques");
+//        sets.add("Mirage");
+//        sets.add("Mirrodin");
+//        sets.add("Mirrodin Besieged");
+//        sets.add("Morningtide");
+//        sets.add("Nemesis");
+//        sets.add("New Phyrexia");
+//        sets.add("Ninth Edition");
+//        sets.add("Odyssey");
+//        sets.add("Onslaught");
+//        sets.add("Planar Chaos");
+//        sets.add("Planechase");
+//        sets.add("Planeshift");
+//        sets.add("Portal");
+//        sets.add("Portal Second Age");
+//        sets.add("Portal Three Kingdoms");
+//        sets.add("Premium Deck Series: Fire and Lightning");
+//        sets.add("Premium Deck Series: Slivers");
+//        sets.add("Promo set for Gatherer");
+//        sets.add("Prophecy");
+//        sets.add("Ravnica: City of Guilds");
+//        sets.add("Revised Edition");
+//        sets.add("Rise of the Eldrazi");
+//        sets.add("Saviors of Kamigawa");
+//        sets.add("Scars of Mirrodin");
+//        sets.add("Scourge");
+//        sets.add("Seventh Edition");
+//        sets.add("Shadowmoor");
+//        sets.add("Shards of Alara");
+//        sets.add("Starter 1999");
+//        sets.add("Starter 2000");
+//        sets.add("Stronghold");
+//        sets.add("Tempest");
+//        sets.add("Tenth Edition");
+//        sets.add("The Dark");
+//        sets.add("Time Spiral");
+//        sets.add("Time Spiral \"Timeshifted\"");
+//        sets.add("Torment");
+//        sets.add("Unlimited Edition");
+//        sets.add("Urza's Destiny");
+//        sets.add("Urza's Legacy");
+//        sets.add("Urza's Saga");
+//        sets.add("Vanguard");
+//        sets.add("Visions");
+//        sets.add("Weatherlight");
+//        sets.add("Worldwake");
+//        sets.add("Zendikar");
+//        sets.add("Magic: The Gathering-Commander");
+
+
+//        sets.add("Unglued");
+//        sets.add("Unhinged");
+
+        readCardsFromFile();
+
+        StringBuilder sb = new StringBuilder();
+        int added = 0;
+        for (String set : sets) {
+            sb.append("|[\"").append(set.replace(" ", "+")).append("\"]");
+            added++;
+            if (added % 20 == 0 || added == sets.size()) {
+                if (!queueCards(sb.toString())) {
+                    System.out.println("Error accured");
+                }
+                sb = new StringBuilder();
+                Thread.sleep(1000);
+            }
+        }
+
+        Thread t = new ThreadStarter();
+        t.start();
+    }
+}
diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParseQueue.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParseQueue.java
new file mode 100644
index 0000000000..d99ea042c6
--- /dev/null
+++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParseQueue.java
@@ -0,0 +1,29 @@
+package north.gatherercrawler;
+
+import
java.util.concurrent.ConcurrentLinkedQueue;
+
+/**
+ * Shared queue of multiverse ids waiting to be fetched; safe for concurrent use.
+ *
+ * @author North
+ */
+public class ParseQueue {
+
+    private static final ParseQueue instance = new ParseQueue();
+    private final ConcurrentLinkedQueue<Integer> queue = new ConcurrentLinkedQueue<Integer>();
+
+    /** Appends an id to the end of the queue. */
+    public static void add(Integer element) {
+        instance.queue.add(element);
+    }
+
+    /** Removes and returns the head; throws NoSuchElementException when the queue is empty. */
+    public static Integer remove() {
+        return instance.queue.remove();
+    }
+
+    /** Returns whether the queue currently holds no ids. */
+    public static boolean isEmpty() {
+        return instance.queue.isEmpty();
+    }
+}
diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParsedList.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParsedList.java
new file mode 100644
index 0000000000..2efe8253a7
--- /dev/null
+++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParsedList.java
@@ -0,0 +1,29 @@
+package north.gatherercrawler;
+
+import java.util.concurrent.ConcurrentSkipListSet;
+
+/**
+ * Process-wide set of the multiverse ids that have already been handled;
+ * backed by a concurrent set so it can be used from several threads.
+ *
+ * @author North
+ */
+public class ParsedList {
+
+    private static final ParsedList instance = new ParsedList();
+    private final ConcurrentSkipListSet<Integer> list;
+
+    public ParsedList() {
+        list = new ConcurrentSkipListSet<Integer>();
+    }
+
+    /** Marks an id as handled. */
+    public static void add(Integer element) {
+        instance.list.add(element);
+    }
+
+    /** Returns whether the id was already marked as handled. */
+    public static boolean contains(Integer value) {
+        return instance.list.contains(value);
+    }
+}
diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ThreadStarter.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ThreadStarter.java
new file mode 100644
index 0000000000..d850d2459d
--- /dev/null
+++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ThreadStarter.java
@@ -0,0 +1,164 @@
+package north.gatherercrawler;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Starts the {@link CardParser} workers, waits until all of them have
+ * finished and then writes the collected cards to the two output files.
+ *
+ * @author North
+ */
+public class ThreadStarter extends Thread {
+
+    /** Number of worker threads that are started. */
+    private static final int THREADS = 10;
+    /** Number of workers that reported completion; guarded by the class lock. */
+    private static int threadsDone = 0;
+
+    /**
+     * Called by a worker when it is done. The count is informational only:
+     * {@link #run()} waits for the worker threads themselves.
+     */
+    public static synchronized void threadDone() {
+        threadsDone++;
+    }
+
+    /** Writes every collected card to cards-data.txt, one serialized card per line. */
+    private void writeCardsToFile() {
+        try {
+            BufferedWriter out = new BufferedWriter(new FileWriter("cards-data.txt"));
+            try {
+                Iterator<Card> iterator = CardsList.iterator();
+                while (iterator.hasNext()) {
+                    out.write(iterator.next().toString());
+                    out.newLine();
+                }
+            } finally {
+                // Close, and thereby flush, even when writing a card fails.
+                out.close();
+            }
+        } catch (Exception e) {
+            System.err.println("Error: " + e.getMessage());
+        }
+    }
+
+    /** Maps a rarity name to its one-letter code; other values are returned unchanged. */
+    private static String rarityCode(String rarity) {
+        if (rarity.equalsIgnoreCase("Mythic Rare")) {
+            return "M";
+        }
+        if (rarity.equalsIgnoreCase("Rare")) {
+            return "R";
+        }
+        if (rarity.equalsIgnoreCase("Uncommon")) {
+            return "U";
+        }
+        if (rarity.equalsIgnoreCase("Common")) {
+            return "C";
+        }
+        if (rarity.equalsIgnoreCase("Basic Land")) {
+            return "L";
+        }
+        return rarity;
+    }
+
+    /**
+     * Formats one line of mtg-cards-data.txt:
+     * {@code name|expansion|number|rarity|mana cost|types|power|toughness|text|}.
+     */
+    private static String toUtilLine(Card card) {
+        StringBuilder sb = new StringBuilder();
+        sb.append(card.getName()).append("|");
+        sb.append(card.getExpansion()).append("|");
+        sb.append(card.getCardNumber() != null ? card.getCardNumber() : "").append("|");
+        sb.append(rarityCode(card.getRarity() != null ? card.getRarity() : "")).append("|");
+
+        for (String cost : card.getManaCost()) {
+            if (!cost.isEmpty()) {
+                sb.append("{").append(cost).append("}");
+            }
+        }
+        sb.append("|");
+
+        sb.append(card.getTypes()).append("|");
+
+        // Only a value with both sides of the slash fills the two columns;
+        // anything else (for example a value without a slash) leaves them
+        // empty instead of failing on the missing second part.
+        String pts = card.getPowerToughness();
+        String[] pt = (pts != null && pts.length() > 1) ? pts.split("/") : new String[0];
+        if (pt.length > 1) {
+            sb.append(pt[0].trim()).append("|");
+            sb.append(pt[1].trim()).append("|");
+        } else {
+            sb.append("||");
+        }
+
+        List<String> cardText = card.getCardText();
+        for (int i = 0; i < cardText.size(); i++) {
+            if (i > 0) {
+                sb.append("$");
+            }
+            sb.append(cardText.get(i));
+        }
+        sb.append("|");
+        return sb.toString();
+    }
+
+    /** Writes every collected card to mtg-cards-data.txt in the format of {@link #toUtilLine(Card)}. */
+    private void writeCardsToUtilFile() {
+        try {
+            BufferedWriter out = new BufferedWriter(new FileWriter("mtg-cards-data.txt"));
+            try {
+                Iterator<Card> iterator = CardsList.iterator();
+                while (iterator.hasNext()) {
+                    out.write(toUtilLine(iterator.next()));
+                    out.newLine();
+                }
+            } finally {
+                // Close, and thereby flush, even when formatting a card fails.
+                out.close();
+            }
+        } catch (Exception e) {
+            System.err.println("Error: " + e.getMessage());
+        }
+    }
+
+    /** Runs the crawl: starts the workers, waits for them and writes both files. */
+    @Override
+    public void run() {
+        List<Thread> workers = new ArrayList<Thread>();
+        for (int i = 0; i < THREADS; i++) {
+            Thread t = new CardParser();
+            t.start();
+            workers.add(t);
+        }
+
+        // Waiting on the threads themselves needs no shared counter and also
+        // ends when a worker dies without calling threadDone().
+        boolean interrupted = false;
+        for (Thread worker : workers) {
+            while (true) {
+                try {
+                    worker.join();
+                    break;
+                } catch (InterruptedException ex) {
+                    // Keep waiting so the files are complete; the interrupt
+                    // is restored once they are written.
+                    interrupted = true;
+                }
+            }
+        }
+
+        writeCardsToFile();
+        writeCardsToUtilFile();
+
+        if (interrupted) {
+            Thread.currentThread().interrupt();
+        }
+    }
+}