|
1 | 1 | package com.olegshan.parser;
|
2 | 2 |
|
3 |
| -import com.olegshan.sites.JobSite; |
| 3 | +import com.olegshan.entity.Job; |
| 4 | +import com.olegshan.service.JobService; |
| 5 | +import com.olegshan.sites.*; |
| 6 | +import com.olegshan.tools.MonthsTools; |
| 7 | +import org.jsoup.Jsoup; |
| 8 | +import org.jsoup.nodes.Document; |
| 9 | +import org.jsoup.nodes.Element; |
| 10 | +import org.jsoup.select.Elements; |
| 11 | +import org.slf4j.Logger; |
| 12 | +import org.slf4j.LoggerFactory; |
| 13 | +import org.springframework.beans.factory.annotation.Autowired; |
| 14 | +import org.springframework.stereotype.Component; |
| 15 | + |
| 16 | +import java.io.IOException; |
| 17 | +import java.time.LocalDate; |
| 18 | +import java.time.LocalDateTime; |
| 19 | +import java.time.LocalTime; |
| 20 | +import java.time.ZoneId; |
4 | 21 |
|
5 | 22 | /**
|
6 | 23 | * Created by olegshan on 03.10.2016.
|
7 | 24 | */
|
8 |
| -public interface Parser { |
9 |
| - void parse(JobSite jobSite); |
| 25 | +@Component |
| 26 | +public class Parser { |
| 27 | + |
| 28 | + private static final Logger LOGGER = LoggerFactory.getLogger(Parser.class); |
| 29 | + @Autowired |
| 30 | + private JobService jobService; |
| 31 | + |
| 32 | + public void parse(JobSite jobSite) { |
| 33 | + |
| 34 | + Document doc = getDoc(jobSite.getSiteUrl()); |
| 35 | + Elements jobBlocks = getJobBlocks(jobSite, doc); |
| 36 | + |
| 37 | + for (Element job : jobBlocks) { |
| 38 | + Elements titleBlock = getTitleBlock(jobSite, job); |
| 39 | + String url = jobSite.getUrlPrefix() + titleBlock.attr("href"); |
| 40 | + String title = getTitle(titleBlock); |
| 41 | + String description = getDescription(jobSite, job); |
| 42 | + String company = getCompany(jobSite, job, url); |
| 43 | + LocalDateTime date = getDate(jobSite, job, url, titleBlock); |
| 44 | + |
| 45 | + Job parsedJob = new Job(title, description, company, jobSite.getSiteName(), url, date); |
| 46 | + jobService.save(parsedJob); |
| 47 | + } |
| 48 | + LOGGER.info("Parsing of {} completed", jobSite.getSiteName()); |
| 49 | + } |
| 50 | + |
| 51 | + private Document getDoc(String siteUrl) { |
| 52 | + try { |
| 53 | + return Jsoup.connect(siteUrl).userAgent("Mozilla").timeout(0).get(); |
| 54 | + } catch (IOException e) { |
| 55 | + LOGGER.error("Connecting to {} failed", siteUrl); |
| 56 | + throw new RuntimeException("Connection failed to " + siteUrl); |
| 57 | + } |
| 58 | + } |
| 59 | + |
| 60 | + private Elements getJobBlocks(JobSite jobSite, Document doc) { |
| 61 | + String[] jobBox = jobSite.getJobBox(); |
| 62 | + if (jobSite instanceof WorkUa) { |
| 63 | + return doc.getElementsByAttributeValueStarting(jobBox[0], jobBox[1]); |
| 64 | + } else if (jobSite instanceof RabotaUa) { |
| 65 | + return getJobBlocksForRabotaUa(doc, jobBox); |
| 66 | + } else { |
| 67 | + return doc.getElementsByAttributeValue(jobBox[0], jobBox[1]); |
| 68 | + } |
| 69 | + } |
| 70 | + |
| 71 | + private Elements getJobBlocksForRabotaUa(Document doc, String[] jobBox) { |
| 72 | + Elements jobBlocks = new Elements(); |
| 73 | + for (int i = 1; i < jobBox.length; i++) { |
| 74 | + Elements jobElements = doc.getElementsByAttributeValue(jobBox[0], jobBox[i]); |
| 75 | + if (jobElements != null && !jobElements.isEmpty()) { |
| 76 | + jobBlocks.addAll(jobElements); |
| 77 | + } |
| 78 | + } |
| 79 | + return jobBlocks; |
| 80 | + } |
| 81 | + |
| 82 | + private Elements getTitleBlock(JobSite jobSite, Element job) { |
| 83 | + String[] titleBox = jobSite.getTitleBox(); |
| 84 | + if (jobSite instanceof WorkUa) { |
| 85 | + return job.getElementsByTag("a"); |
| 86 | + } else { |
| 87 | + return job.getElementsByAttributeValue(titleBox[0], titleBox[1]); |
| 88 | + } |
| 89 | + } |
| 90 | + |
| 91 | + private String getTitle(Elements titleBlock) { |
| 92 | + String title = titleBlock.text(); |
| 93 | + if (title.endsWith("Горячая")) { |
| 94 | + title = title.substring(0, title.length() - "Горячая".length()); |
| 95 | + } |
| 96 | + return title; |
| 97 | + } |
| 98 | + |
| 99 | + private String getDescription(JobSite jobSite, Element job) { |
| 100 | + String[] descriptionData = jobSite.getDescriptionData(); |
| 101 | + return job.getElementsByAttributeValue(descriptionData[0], descriptionData[1]).text(); |
| 102 | + } |
| 103 | + |
| 104 | + private LocalDateTime getDate(JobSite jobSite, Element job, String url, Elements titleBlock) { |
| 105 | + String[] dateData = jobSite.getDateData(); |
| 106 | + String dateLine; |
| 107 | + if (jobSite instanceof DouUa) { |
| 108 | + Document dateDoc = getDoc(url); |
| 109 | + dateLine = dateDoc.getElementsByAttributeValue(dateData[0], dateData[1]).text(); |
| 110 | + return getDateByLine(jobSite, dateLine); |
| 111 | + } else if (jobSite instanceof RabotaUa) { |
| 112 | + return getDateForRabotaUa(url); |
| 113 | + } else { |
| 114 | + if (jobSite instanceof WorkUa) { |
| 115 | + dateLine = titleBlock.attr("title"); |
| 116 | + } else { |
| 117 | + dateLine = job.getElementsByAttributeValue(dateData[0], dateData[1]).text(); |
| 118 | + } |
| 119 | + return getDateByLine(jobSite, dateLine); |
| 120 | + } |
| 121 | + } |
| 122 | + |
| 123 | + private LocalDateTime getDateByLine(JobSite jobSite, String dateLine) { |
| 124 | + String[] dateParts; |
| 125 | + int year; |
| 126 | + int month; |
| 127 | + int day; |
| 128 | + String split = jobSite.getSplit(); |
| 129 | + if (jobSite instanceof JobsUa) { |
| 130 | + dateLine = dateLine.substring(0, 10); |
| 131 | + } |
| 132 | + if (jobSite instanceof WorkUa) { |
| 133 | + dateLine = dateLine.substring(dateLine.length() - 8); |
| 134 | + } |
| 135 | + dateParts = dateLine.split(split); |
| 136 | + MonthsTools.removeZero(dateParts); |
| 137 | + day = Integer.parseInt(dateParts[0]); |
| 138 | + if (jobSite instanceof HeadHunterUa) { |
| 139 | + year = LocalDate.now(ZoneId.of("Europe/Athens")).getYear(); |
| 140 | + } else year = Integer.parseInt(dateParts[2]); |
| 141 | + if (jobSite instanceof WorkUa) { |
| 142 | + year = year + 2000; |
| 143 | + } |
| 144 | + if (jobSite instanceof DouUa || jobSite instanceof HeadHunterUa) { |
| 145 | + month = MonthsTools.MONTHS.get(dateParts[1].toLowerCase()); |
| 146 | + } else month = Integer.parseInt(dateParts[1]); |
| 147 | + return LocalDate.of(year, month, day).atTime(getTime()); |
| 148 | + } |
| 149 | + |
| 150 | + private LocalDateTime getDateForRabotaUa(String url) { |
| 151 | + /* |
| 152 | + * There are several problems here. |
| 153 | + * First: there are two types of date tags, used on rabota.ua on different pages: "d-date" and "datePosted". |
| 154 | + * Second: sometimes date format is dd.mm.yyyy and sometimes — yyyy-mm-dd. |
| 155 | + * Third: sometimes there is no date at all. |
| 156 | + */ |
| 157 | + Document dateDoc = getDoc(url); |
| 158 | + String dateLine; |
| 159 | + String[] dateParts; |
| 160 | + int year; |
| 161 | + int month; |
| 162 | + int day; |
| 163 | + |
| 164 | + Elements dateElements = dateDoc.getElementsByAttributeValue("id", "d-date"); |
| 165 | + if (!dateElements.isEmpty()) { |
| 166 | + dateLine = dateElements.get(0).getElementsByAttributeValue("class", "d-ph-value").text(); |
| 167 | + } else { |
| 168 | + dateLine = dateDoc.getElementsByAttributeValue("itemprop", "datePosted").text(); |
| 169 | + if (dateLine.length() == 0) { |
| 170 | + //no date at all, sometimes it happens |
| 171 | + LocalDateTime ldt = LocalDateTime.now(ZoneId.of("Europe/Athens")); |
| 172 | + LOGGER.debug("There was no date on Rabota.ua, return {}", ldt); |
| 173 | + return ldt; |
| 174 | + } |
| 175 | + } |
| 176 | + try { |
| 177 | + dateParts = dateLine.split("\\."); |
| 178 | + MonthsTools.removeZero(dateParts); |
| 179 | + year = Integer.parseInt(dateParts[2]); |
| 180 | + month = Integer.parseInt(dateParts[1]); |
| 181 | + day = Integer.parseInt(dateParts[0]); |
| 182 | + |
| 183 | + } catch (ArrayIndexOutOfBoundsException e) { |
| 184 | + |
| 185 | + dateParts = dateLine.split("-"); |
| 186 | + MonthsTools.removeZero(dateParts); |
| 187 | + year = Integer.parseInt(dateParts[0]); |
| 188 | + month = Integer.parseInt(dateParts[1]); |
| 189 | + day = Integer.parseInt(dateParts[2]); |
| 190 | + } |
| 191 | + return LocalDate.of(year, month, day).atTime(getTime()); |
| 192 | + } |
| 193 | + |
| 194 | + private LocalTime getTime() { |
| 195 | + return LocalTime.now(ZoneId.of("Europe/Athens")); |
| 196 | + } |
| 197 | + |
| 198 | + private String getCompany(JobSite jobSite, Element job, String url) { |
| 199 | + String[] companyData = jobSite.getCompanyData(); |
| 200 | + if (jobSite instanceof JobsUa || jobSite instanceof WorkUa) { |
| 201 | + Document jobDoc = getDoc(url); |
| 202 | + Elements companyBlock = jobDoc.getElementsByAttributeValue(companyData[0], companyData[1]); |
| 203 | + return companyBlock.get(0).getElementsByTag("a").first().text(); |
| 204 | + } else { |
| 205 | + return job.getElementsByAttributeValue(companyData[0], companyData[1]).text(); |
| 206 | + } |
| 207 | + } |
10 | 208 | }
|
0 commit comments