From d0a934f2dc8d8b3b97b2f9e73f7f5d29cb59a9a9 Mon Sep 17 00:00:00 2001 From: North Date: Sun, 22 Jan 2012 13:48:34 +0200 Subject: [PATCH] [GathererCrawler] updated to support flip cards, split cards and double sided cards. --- .../main/java/north/gatherercrawler/Card.java | 18 +- .../north/gatherercrawler/CardParser.java | 238 +++++++++++------- .../main/java/north/gatherercrawler/Main.java | 9 +- .../north/gatherercrawler/ThreadStarter.java | 30 ++- .../gatherercrawler/{ => util}/CardsList.java | 3 +- .../{ => util}/ParseQueue.java | 2 +- .../{ => util}/ParsedList.java | 2 +- 7 files changed, 190 insertions(+), 112 deletions(-) rename Utils/GathererCrawler/src/main/java/north/gatherercrawler/{ => util}/CardsList.java (87%) rename Utils/GathererCrawler/src/main/java/north/gatherercrawler/{ => util}/ParseQueue.java (94%) rename Utils/GathererCrawler/src/main/java/north/gatherercrawler/{ => util}/ParsedList.java (93%) diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Card.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Card.java index 3317049a16..0519a41d22 100644 --- a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Card.java +++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Card.java @@ -22,13 +22,14 @@ public class Card implements Comparable { private String rarity; private String cardNumber; private String artist; + private Card otherSide; public Card(Integer multiverseId) { this.multiverseId = multiverseId; } public Card(String card) { - String[] split = card.split("\\|",13); + String[] split = card.split("\\|", 13); if (split[0].length() > 0) { multiverseId = Integer.parseInt(split[0]); } @@ -160,6 +161,14 @@ public class Card implements Comparable { this.types = types; } + public Card getOtherSide() { + return otherSide; + } + + public void setOtherSide(Card otherSide) { + this.otherSide = otherSide; + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -193,10 +202,15 @@ public class Card implements Comparable { sb.append(rarity != null ? rarity : "").append("|"); sb.append(cardNumber != null ? cardNumber : "").append("|"); sb.append(artist != null ? artist : ""); + + if (otherSide != null) { + sb.append("\n").append(otherSide.toString()); + } return sb.toString(); } public int compareTo(Card o) { - return this.multiverseId.compareTo(o.getMultiverseId()); + int idCompareResult = this.multiverseId.compareTo(o.getMultiverseId()); + return idCompareResult == 0 ? this.cardNumber.compareTo(o.getCardNumber()) : idCompareResult; } } diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardParser.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardParser.java index 2b97ead046..448c94745a 100644 --- a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardParser.java +++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardParser.java @@ -5,6 +5,9 @@ import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import north.gatherercrawler.util.CardsList; +import north.gatherercrawler.util.ParseQueue; +import north.gatherercrawler.util.ParsedList; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -22,11 +25,10 @@ public class CardParser extends Thread { private boolean parseCard(Integer multiverseId) { String url = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=" + multiverseId; - Card card = new Card(multiverseId); + Card card; Document doc = null; - int retries = 30; boolean done = false; - while (retries > 0 && !done) { + while (!done) { try { Connection connection = Jsoup.connect(url); connection.timeout(20000); @@ -41,113 +43,155 @@ public class CardParser extends Thread { } try { - Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContentHeader_subtitleDisplay"); - String cardName = ""; - String selectorModifier = ""; + Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value"); if (!select.isEmpty()) { - cardName = select.get(0).text().trim(); - } + card = extractCardData(doc, "", multiverseId); + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_rightCol ul li a"); - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value"); - if (!select.isEmpty()) { - card.setName(select.get(0).text().trim()); + // for multi-part cards + if (!select.isEmpty()) { + String href = select.attr("href"); + url = "http://gatherer.wizards.com/Pages/Card/Details.aspx" + href.substring(href.indexOf("?")); + + done = false; + while (!done) { + try { + Connection connection = Jsoup.connect(url); + connection.timeout(20000); + doc = connection.get(); + } catch (IOException ex) { + } + done = true; + } + if (!done) { + System.out.println("Card get exception: " + multiverseId); + } else { + card.setCardNumber(card.getCardNumber() + "b"); + Card cardSide = extractCardData(doc, "", multiverseId); + cardSide.setCardNumber(cardSide.getCardNumber() + "a"); + cardSide.setOtherSide(card); + card = cardSide; + } + } } else { - card.setName(cardName); - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl05_nameRow .value"); - if (!select.isEmpty() && select.get(0).text().trim().equals(cardName)) { - selectorModifier = "_ctl05"; - } else { - selectorModifier = "_ctl06"; + // for flip / double sided cards + card = extractCardData(doc, "_ctl05", multiverseId); + if (card == null) { + return false; } - } - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img"); - List manaCost = new ArrayList(); - if (!select.isEmpty()) { - for (Element element : select) { - manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "")); + card.setOtherSide(extractCardData(doc, "_ctl06", multiverseId)); + if (card.getOtherSide() == null) { + return false; } } - card.setManaCost(manaCost); - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value"); - if (!select.isEmpty()) { - card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim())); - } - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value"); - if (!select.isEmpty()) { - card.setTypes(select.get(0).text().trim()); - } - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox"); - List cardText = new ArrayList(); - if (!select.isEmpty()) { - for (Element element : select) { - cardText.add(element.html().trim().replace("\"[\\d\\w\\s]+?\"", "").replace("\n", "").replace(""", "\"")); - } - } - card.setCardText(cardText); - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox"); - List flavorText = new ArrayList(); - if (!select.isEmpty()) { - for (Element element : select) { - flavorText.add(element.html().trim().replace(""", "\"").replace("", "").replace("", "")); - } - } - card.setFlavorText(flavorText); - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value"); - if (!select.isEmpty()) { - card.setPowerToughness(select.get(0).text().trim()); - } - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a"); - if (!select.isEmpty()) { - card.setExpansion(select.get(1).text().trim()); - } - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span"); - if (!select.isEmpty()) { - card.setRarity(select.get(0).text().trim()); - } - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a"); - List otherSets = new ArrayList(); - if (!select.isEmpty()) { - for (Element element : select) { - otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", ""))); - } - } -// card.setOtherSets(otherSets); - for (Integer otherSet : otherSets) { - if (!ParsedList.contains(otherSet)) { - ParseQueue.add(otherSet); - } - } - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value"); - if (!select.isEmpty()) { - card.setCardNumber(select.get(0).text().trim()); - } - - select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a"); - if (!select.isEmpty()) { - card.setArtist(select.get(0).text().trim()); - } } catch (Exception e) { return false; } + + if (card == null) { + return false; + } + CardsList.add(card); + return true; + } + + private Card extractCardData(Document doc, String selectorModifier, Integer id) throws NumberFormatException { + Elements select; + + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cardImage"); + Integer multiverseId = null; + if (!select.isEmpty()) { + Pattern pattern = Pattern.compile("(?<=multiverseid=)\\d+"); + Matcher matcher = pattern.matcher(select.get(0).attr("src")); + if (matcher.find()) { + multiverseId = Integer.parseInt(matcher.group()); + } + } + if (multiverseId == null) { + return null; + } + + Card card = new Card(multiverseId); + + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_nameRow .value"); + if (!select.isEmpty()) { + card.setName(select.get(0).text().trim()); + } + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img"); + List manaCost = new ArrayList(); + if (!select.isEmpty()) { + for (Element element : select) { + manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "")); + } + } + card.setManaCost(manaCost); + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value"); + if (!select.isEmpty()) { + card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim())); + } + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value"); + if (!select.isEmpty()) { + card.setTypes(select.get(0).text().trim()); + } + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox"); + List cardText = new ArrayList(); + if (!select.isEmpty()) { + for (Element element : select) { + cardText.add(element.html().trim().replace("\"[\\d\\w\\s]+?\"", "").replace("\n", "").replace(""", "\"")); + } + } + card.setCardText(cardText); + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox"); + List flavorText = new ArrayList(); + if (!select.isEmpty()) { + for (Element element : select) { + flavorText.add(element.html().trim().replace(""", "\"").replace("", "").replace("", "")); + } + } + card.setFlavorText(flavorText); + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value"); + if (!select.isEmpty()) { + card.setPowerToughness(select.get(0).text().trim()); + } + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a"); + if (!select.isEmpty()) { + card.setExpansion(select.get(1).text().trim()); + } + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span"); + if (!select.isEmpty()) { + card.setRarity(select.get(0).text().trim()); + } + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a"); + List otherSets = new ArrayList(); + if (!select.isEmpty()) { + for (Element element : select) { + otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", ""))); + } + } + // card.setOtherSets(otherSets); + for (Integer otherSet : otherSets) { + if (!ParsedList.contains(otherSet)) { + ParseQueue.add(otherSet); + } + } + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value"); + if (!select.isEmpty()) { + card.setCardNumber(select.get(0).text().trim()); + } + select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a"); + if (!select.isEmpty()) { + card.setArtist(select.get(0).text().trim()); + } + if (card.getCardNumber() == null) { - url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+'); + String url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+'); try { Connection connection = Jsoup.connect(url); connection.timeout(20000); doc = connection.get(); - Elements select = doc.select("small a:contains(" + card.getExpansion() + ")"); + select = doc.select("small a:contains(" + card.getExpansion() + ")"); if (!select.isEmpty()) { Matcher matcher = patternUrl.matcher(select.get(0).attr("href")); matcher.find(); @@ -166,7 +210,7 @@ public class CardParser extends Thread { } if (card.getCardNumber() == null) { - Elements select = doc.select("p a:contains(" + card.getExpansion() + ")"); + select = doc.select("p a:contains(" + card.getExpansion() + ")"); if (!select.isEmpty()) { Matcher matcher = patternUrl.matcher(select.get(0).attr("href")); matcher.find(); @@ -183,8 +227,8 @@ public class CardParser extends Thread { System.out.println("Card number missing: " + card.getName()); } } - CardsList.add(card); - return true; + + return card; } @Override diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Main.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Main.java index 0cae3e9dd2..8d785e612a 100644 --- a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Main.java +++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/Main.java @@ -1,12 +1,11 @@ package north.gatherercrawler; -import java.io.BufferedReader; -import java.io.DataInputStream; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; +import java.io.*; import java.util.ArrayList; import java.util.List; +import north.gatherercrawler.util.CardsList; +import north.gatherercrawler.util.ParseQueue; +import north.gatherercrawler.util.ParsedList; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ThreadStarter.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ThreadStarter.java index 15f728e335..3fd770e29c 100644 --- a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ThreadStarter.java +++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ThreadStarter.java @@ -2,8 +2,8 @@ package north.gatherercrawler; import java.io.BufferedWriter; import java.io.FileWriter; -import java.util.Iterator; -import java.util.List; +import java.util.*; +import north.gatherercrawler.util.CardsList; /** * @@ -13,16 +13,35 @@ public class ThreadStarter extends Thread { private static Integer threadsDone = 0; private final Integer threads = 10; + private List sortedCards; public static synchronized void threadDone() { threadsDone++; } + private void updateSortedCards() { + if (sortedCards == null) { + sortedCards = new ArrayList(); + Iterator iterator = CardsList.iterator(); + while (iterator.hasNext()) { + sortedCards.add(iterator.next()); + } + + Collections.sort(sortedCards, new Comparator() { + + public int compare(Card o1, Card o2) { + int expansionCompare = o1.getExpansion().compareTo(o2.getExpansion()); + return expansionCompare != 0 ? expansionCompare : o1.getCardNumber().compareTo(o2.getCardNumber()); + } + }); + } + } + private void writeCardsToFile() { try { FileWriter fstream = new FileWriter("cards-data.txt"); BufferedWriter out = new BufferedWriter(fstream); - Iterator iterator = CardsList.iterator(); + Iterator iterator = sortedCards.iterator(); while (iterator.hasNext()) { out.write(iterator.next().toString()); out.newLine(); @@ -37,7 +56,7 @@ public class ThreadStarter extends Thread { try { FileWriter fstream = new FileWriter("mtg-cards-data.txt"); BufferedWriter out = new BufferedWriter(fstream); - Iterator iterator = CardsList.iterator(); + Iterator iterator = sortedCards.iterator(); while (iterator.hasNext()) { Card card = iterator.next(); StringBuilder sb = new StringBuilder(); @@ -79,7 +98,7 @@ public class ThreadStarter extends Thread { } else { sb.append("||"); } - + List cardText = card.getCardText(); for (int i = 0; i < cardText.size(); i++) { sb.append(cardText.get(i)); @@ -114,6 +133,7 @@ public class ThreadStarter extends Thread { } } + updateSortedCards(); writeCardsToFile(); writeCardsToUtilFile(); } diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardsList.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/util/CardsList.java similarity index 87% rename from Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardsList.java rename to Utils/GathererCrawler/src/main/java/north/gatherercrawler/util/CardsList.java index 41bf308e03..7e6ea0086a 100644 --- a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/CardsList.java +++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/util/CardsList.java @@ -1,7 +1,8 @@ -package north.gatherercrawler; +package north.gatherercrawler.util; import java.util.Iterator; import java.util.concurrent.ConcurrentSkipListSet; +import north.gatherercrawler.Card; /** * diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParseQueue.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/util/ParseQueue.java similarity index 94% rename from Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParseQueue.java rename to Utils/GathererCrawler/src/main/java/north/gatherercrawler/util/ParseQueue.java index d99ea042c6..c45b763095 100644 --- a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParseQueue.java +++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/util/ParseQueue.java @@ -1,4 +1,4 @@ -package north.gatherercrawler; +package north.gatherercrawler.util; import java.util.concurrent.ConcurrentLinkedQueue; diff --git a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParsedList.java b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/util/ParsedList.java similarity index 93% rename from Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParsedList.java rename to Utils/GathererCrawler/src/main/java/north/gatherercrawler/util/ParsedList.java index 2efe8253a7..caa39038d7 100644 --- a/Utils/GathererCrawler/src/main/java/north/gatherercrawler/ParsedList.java +++ b/Utils/GathererCrawler/src/main/java/north/gatherercrawler/util/ParsedList.java @@ -1,4 +1,4 @@ -package north.gatherercrawler; +package north.gatherercrawler.util; import java.util.concurrent.ConcurrentSkipListSet;