2011-07-23 16:18:37 +00:00
|
|
|
package north.gatherercrawler;
|
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.List;
|
2011-08-05 20:11:25 +00:00
|
|
|
import java.util.regex.Matcher;
|
|
|
|
import java.util.regex.Pattern;
|
2012-01-22 11:48:34 +00:00
|
|
|
import north.gatherercrawler.util.CardsList;
|
|
|
|
import north.gatherercrawler.util.ParseQueue;
|
|
|
|
import north.gatherercrawler.util.ParsedList;
|
2011-07-23 16:18:37 +00:00
|
|
|
import org.jsoup.Connection;
|
|
|
|
import org.jsoup.Jsoup;
|
|
|
|
import org.jsoup.nodes.Document;
|
|
|
|
import org.jsoup.nodes.Element;
|
|
|
|
import org.jsoup.select.Elements;
|
|
|
|
|
|
|
|
/**
|
|
|
|
*
|
2011-08-28 14:04:21 +00:00
|
|
|
* @author North
|
2011-07-23 16:18:37 +00:00
|
|
|
*/
|
|
|
|
public class CardParser extends Thread {
|
|
|
|
|
2011-08-05 20:11:25 +00:00
|
|
|
private static final Pattern patternPrint = Pattern.compile("(?<=#)[\\w\\d]+?(?= )");
|
|
|
|
private static final Pattern patternUrl = Pattern.compile("(?<=/)[\\w\\d]+?(?=\\.html)");
|
|
|
|
|
2011-07-23 16:18:37 +00:00
|
|
|
private boolean parseCard(Integer multiverseId) {
|
|
|
|
String url = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=" + multiverseId;
|
2012-01-22 11:48:34 +00:00
|
|
|
Card card;
|
2011-07-23 16:18:37 +00:00
|
|
|
Document doc = null;
|
|
|
|
boolean done = false;
|
2012-01-22 11:48:34 +00:00
|
|
|
while (!done) {
|
2011-07-23 16:18:37 +00:00
|
|
|
try {
|
|
|
|
Connection connection = Jsoup.connect(url);
|
|
|
|
connection.timeout(20000);
|
|
|
|
doc = connection.get();
|
|
|
|
} catch (IOException ex) {
|
|
|
|
}
|
|
|
|
done = true;
|
|
|
|
}
|
|
|
|
if (!done) {
|
|
|
|
System.out.println("Card get exception: " + multiverseId);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
try {
|
2012-01-22 11:48:34 +00:00
|
|
|
Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value");
|
2011-09-25 08:54:24 +00:00
|
|
|
if (!select.isEmpty()) {
|
2012-01-22 11:48:34 +00:00
|
|
|
card = extractCardData(doc, "", multiverseId);
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_rightCol ul li a");
|
2011-09-25 08:54:24 +00:00
|
|
|
|
2012-01-22 11:48:34 +00:00
|
|
|
// for multi-part cards
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
String href = select.attr("href");
|
|
|
|
url = "http://gatherer.wizards.com/Pages/Card/Details.aspx" + href.substring(href.indexOf("?"));
|
|
|
|
|
|
|
|
done = false;
|
|
|
|
while (!done) {
|
|
|
|
try {
|
|
|
|
Connection connection = Jsoup.connect(url);
|
|
|
|
connection.timeout(20000);
|
|
|
|
doc = connection.get();
|
|
|
|
} catch (IOException ex) {
|
|
|
|
}
|
|
|
|
done = true;
|
|
|
|
}
|
|
|
|
if (!done) {
|
|
|
|
System.out.println("Card get exception: " + multiverseId);
|
|
|
|
} else {
|
|
|
|
card.setCardNumber(card.getCardNumber() + "b");
|
|
|
|
Card cardSide = extractCardData(doc, "", multiverseId);
|
|
|
|
cardSide.setCardNumber(cardSide.getCardNumber() + "a");
|
|
|
|
cardSide.setOtherSide(card);
|
|
|
|
card = cardSide;
|
|
|
|
}
|
|
|
|
}
|
2011-09-25 08:54:24 +00:00
|
|
|
} else {
|
2012-01-22 11:48:34 +00:00
|
|
|
// for flip / double sided cards
|
|
|
|
card = extractCardData(doc, "_ctl05", multiverseId);
|
|
|
|
if (card == null) {
|
|
|
|
return false;
|
2011-09-25 08:54:24 +00:00
|
|
|
}
|
2012-01-22 11:48:34 +00:00
|
|
|
card.setOtherSide(extractCardData(doc, "_ctl06", multiverseId));
|
|
|
|
if (card.getOtherSide() == null) {
|
|
|
|
return false;
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
|
|
|
}
|
2012-01-22 11:48:34 +00:00
|
|
|
} catch (Exception e) {
|
|
|
|
return false;
|
|
|
|
}
|
2011-07-23 16:18:37 +00:00
|
|
|
|
|
|
|
|
2012-01-22 11:48:34 +00:00
|
|
|
if (card == null) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
CardsList.add(card);
|
|
|
|
return true;
|
|
|
|
}
|
2011-07-23 16:18:37 +00:00
|
|
|
|
2012-01-22 11:48:34 +00:00
|
|
|
private Card extractCardData(Document doc, String selectorModifier, Integer id) throws NumberFormatException {
|
|
|
|
Elements select;
|
2011-07-23 16:18:37 +00:00
|
|
|
|
2012-01-22 11:48:34 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cardImage");
|
|
|
|
Integer multiverseId = null;
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
Pattern pattern = Pattern.compile("(?<=multiverseid=)\\d+");
|
|
|
|
Matcher matcher = pattern.matcher(select.get(0).attr("src"));
|
|
|
|
if (matcher.find()) {
|
|
|
|
multiverseId = Integer.parseInt(matcher.group());
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
2012-01-22 11:48:34 +00:00
|
|
|
}
|
|
|
|
if (multiverseId == null) {
|
|
|
|
return null;
|
|
|
|
}
|
2011-07-23 16:18:37 +00:00
|
|
|
|
2012-01-22 11:48:34 +00:00
|
|
|
Card card = new Card(multiverseId);
|
2011-07-23 16:18:37 +00:00
|
|
|
|
2012-01-22 11:48:34 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_nameRow .value");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setName(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img");
|
|
|
|
List<String> manaCost = new ArrayList<String>();
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
for (Element element : select) {
|
|
|
|
manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
2012-01-22 11:48:34 +00:00
|
|
|
}
|
|
|
|
card.setManaCost(manaCost);
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim()));
|
|
|
|
}
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setTypes(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox");
|
|
|
|
List<String> cardText = new ArrayList<String>();
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
for (Element element : select) {
|
|
|
|
cardText.add(element.html().trim().replace("<img src=\"/Handlers/Image.ashx?size=small&name=", "{").replace("&type=symbol", "}").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "").replace("\n", "").replace(""", "\""));
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
2012-01-22 11:48:34 +00:00
|
|
|
}
|
|
|
|
card.setCardText(cardText);
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox");
|
|
|
|
List<String> flavorText = new ArrayList<String>();
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
for (Element element : select) {
|
|
|
|
flavorText.add(element.html().trim().replace(""", "\"").replace("<i>", "").replace("</i>", ""));
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
2012-01-22 11:48:34 +00:00
|
|
|
}
|
|
|
|
card.setFlavorText(flavorText);
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setPowerToughness(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setExpansion(select.get(1).text().trim());
|
|
|
|
}
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setRarity(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a");
|
|
|
|
List<Integer> otherSets = new ArrayList<Integer>();
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
for (Element element : select) {
|
|
|
|
otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", "")));
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
2012-01-22 11:48:34 +00:00
|
|
|
}
|
|
|
|
// card.setOtherSets(otherSets);
|
|
|
|
for (Integer otherSet : otherSets) {
|
|
|
|
if (!ParsedList.contains(otherSet)) {
|
|
|
|
ParseQueue.add(otherSet);
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
2012-01-22 11:48:34 +00:00
|
|
|
}
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setCardNumber(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setArtist(select.get(0).text().trim());
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
|
|
|
|
2011-08-05 20:11:25 +00:00
|
|
|
if (card.getCardNumber() == null) {
|
2012-01-22 11:48:34 +00:00
|
|
|
String url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+');
|
2011-08-05 20:11:25 +00:00
|
|
|
try {
|
|
|
|
Connection connection = Jsoup.connect(url);
|
|
|
|
connection.timeout(20000);
|
|
|
|
doc = connection.get();
|
|
|
|
|
2012-01-22 11:48:34 +00:00
|
|
|
select = doc.select("small a:contains(" + card.getExpansion() + ")");
|
2011-08-05 20:11:25 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
|
|
|
|
matcher.find();
|
|
|
|
card.setCardNumber(matcher.group());
|
|
|
|
} else {
|
|
|
|
select = doc.select("small b:contains(#)");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
Matcher matcher = patternPrint.matcher(select.get(0).html());
|
|
|
|
matcher.find();
|
|
|
|
card.setCardNumber(matcher.group());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} catch (IOException ex) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (card.getCardNumber() == null) {
|
2012-01-22 11:48:34 +00:00
|
|
|
select = doc.select("p a:contains(" + card.getExpansion() + ")");
|
2011-08-28 14:04:21 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
|
|
|
|
matcher.find();
|
|
|
|
card.setCardNumber(matcher.group());
|
|
|
|
} else {
|
|
|
|
select = doc.select("p b:contains(#)");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
Matcher matcher = patternPrint.matcher(select.get(0).html());
|
|
|
|
matcher.find();
|
|
|
|
card.setCardNumber(matcher.group());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (card.getCardNumber() == null) {
|
|
|
|
System.out.println("Card number missing: " + card.getName());
|
|
|
|
}
|
2011-08-05 20:11:25 +00:00
|
|
|
}
|
2012-01-22 11:48:34 +00:00
|
|
|
|
|
|
|
return card;
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void run() {
|
|
|
|
while (!ParseQueue.isEmpty()) {
|
|
|
|
Integer multiverseId = ParseQueue.remove();
|
|
|
|
if (!ParsedList.contains(multiverseId)) {
|
|
|
|
ParsedList.add(multiverseId);
|
|
|
|
parseCard(multiverseId);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ThreadStarter.threadDone();
|
|
|
|
}
|
|
|
|
}
|