Skip to content

Commit 6539851

Browse files
authored
Revert "Design changes and recommendations"
1 parent 95733f0 commit 6539851

21 files changed

+372
-483
lines changed

pom.xml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,18 @@
4040
<artifactId>spring-boot-starter-data-jpa</artifactId>
4141
</dependency>
4242

43+
<!--<dependency>
44+
<groupId>org.springframework.boot</groupId>
45+
<artifactId>spring-boot-starter-tomcat</artifactId>
46+
<scope>provided</scope>
47+
</dependency>-->
48+
4349
<dependency>
4450
<groupId>org.postgresql</groupId>
4551
<artifactId>postgresql</artifactId>
4652
<version>9.4.1211.jre7</version>
4753
</dependency>
4854

49-
<dependency>
50-
<groupId>org.projectlombok</groupId>
51-
<artifactId>lombok</artifactId>
52-
<version>1.16.10</version>
53-
</dependency>
54-
5555
<dependency>
5656
<groupId>org.springframework.boot</groupId>
5757
<artifactId>spring-boot-starter-test</artifactId>

src/main/java/com/olegshan/controllers/ParseController.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,9 @@
2020
public class ParseController {
2121

2222
private static final int PAGE_SIZE = 40;
23-
private JobService jobService;
2423

2524
@Autowired
26-
public ParseController(JobService jobService) {
27-
this.jobService = jobService;
28-
}
25+
JobService jobService;
2926

3027
@RequestMapping(value = "/", method = RequestMethod.GET)
3128
public ModelAndView showJobs(@RequestParam(value = "page", required = false) Integer page) {
src/main/java/com/olegshan/entity/Job.java

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,15 @@
11
package com.olegshan.entity;
22

3-
import lombok.Data;
4-
53
import javax.persistence.Column;
64
import javax.persistence.Entity;
75
import javax.persistence.Id;
86
import java.time.LocalDateTime;
9-
10-
import static java.time.format.DateTimeFormatter.ofPattern;
7+
import java.time.format.DateTimeFormatter;
118

129
/**
1310
* Created by olegshan on 24.09.2016.
1411
*/
1512
@Entity
16-
@Data
1713
public class Job {
1814

1915
@Id
@@ -27,16 +23,68 @@ public class Job {
2723
private LocalDateTime date;
2824
private String dateToDisplay;
2925

30-
public Job(String url, String title, String description, String company, String source, LocalDateTime date) {
26+
public Job() {
27+
}
28+
29+
public Job(String title, String description, String company, String source, String url, LocalDateTime date) {
30+
this.title = title;
31+
this.description = description;
32+
this.company = company;
33+
this.source = source;
3134
this.url = url;
35+
this.date = date;
36+
}
37+
38+
public String getTitle() {
39+
return title;
40+
}
41+
42+
public void setTitle(String title) {
3243
this.title = title;
44+
}
45+
46+
public String getDescription() {
47+
return description;
48+
}
49+
50+
public void setDescription(String description) {
3351
this.description = description;
52+
}
53+
54+
public String getCompany() {
55+
return company;
56+
}
57+
58+
public void setCompany(String company) {
3459
this.company = company;
60+
}
61+
62+
public String getSource() {
63+
return source;
64+
}
65+
66+
public void setSource(String source) {
3567
this.source = source;
68+
}
69+
70+
public String getUrl() {
71+
return url;
72+
}
73+
74+
public void setUrl(String url) {
75+
this.url = url;
76+
}
77+
78+
public LocalDateTime getDate() {
79+
return date;
80+
}
81+
82+
public void setDate(LocalDateTime date) {
3683
this.date = date;
3784
}
3885

3986
public String getDateToDisplay() {
40-
return date.format(ofPattern("d MMMM"));
87+
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("d MMMM");
88+
return date.format(formatter);
4189
}
4290
}
src/main/java/com/olegshan/parser/Parser.java

Lines changed: 201 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,208 @@
11
package com.olegshan.parser;
22

3-
import com.olegshan.sites.JobSite;
3+
import com.olegshan.entity.Job;
4+
import com.olegshan.service.JobService;
5+
import com.olegshan.sites.*;
6+
import com.olegshan.tools.MonthsTools;
7+
import org.jsoup.Jsoup;
8+
import org.jsoup.nodes.Document;
9+
import org.jsoup.nodes.Element;
10+
import org.jsoup.select.Elements;
11+
import org.slf4j.Logger;
12+
import org.slf4j.LoggerFactory;
13+
import org.springframework.beans.factory.annotation.Autowired;
14+
import org.springframework.stereotype.Component;
15+
16+
import java.io.IOException;
17+
import java.time.LocalDate;
18+
import java.time.LocalDateTime;
19+
import java.time.LocalTime;
20+
import java.time.ZoneId;
421

522
/**
623
* Created by olegshan on 03.10.2016.
724
*/
8-
public interface Parser {
9-
void parse(JobSite jobSite);
25+
@Component
26+
public class Parser {
27+
28+
private static final Logger LOGGER = LoggerFactory.getLogger(Parser.class);
29+
@Autowired
30+
private JobService jobService;
31+
32+
public void parse(JobSite jobSite) {
33+
34+
Document doc = getDoc(jobSite.getSiteUrl());
35+
Elements jobBlocks = getJobBlocks(jobSite, doc);
36+
37+
for (Element job : jobBlocks) {
38+
Elements titleBlock = getTitleBlock(jobSite, job);
39+
String url = jobSite.getUrlPrefix() + titleBlock.attr("href");
40+
String title = getTitle(titleBlock);
41+
String description = getDescription(jobSite, job);
42+
String company = getCompany(jobSite, job, url);
43+
LocalDateTime date = getDate(jobSite, job, url, titleBlock);
44+
45+
Job parsedJob = new Job(title, description, company, jobSite.getSiteName(), url, date);
46+
jobService.save(parsedJob);
47+
}
48+
LOGGER.info("Parsing of {} completed", jobSite.getSiteName());
49+
}
50+
51+
private Document getDoc(String siteUrl) {
52+
try {
53+
return Jsoup.connect(siteUrl).userAgent("Mozilla").timeout(0).get();
54+
} catch (IOException e) {
55+
LOGGER.error("Connecting to {} failed", siteUrl);
56+
throw new RuntimeException("Connection failed to " + siteUrl);
57+
}
58+
}
59+
60+
private Elements getJobBlocks(JobSite jobSite, Document doc) {
61+
String[] jobBox = jobSite.getJobBox();
62+
if (jobSite instanceof WorkUa) {
63+
return doc.getElementsByAttributeValueStarting(jobBox[0], jobBox[1]);
64+
} else if (jobSite instanceof RabotaUa) {
65+
return getJobBlocksForRabotaUa(doc, jobBox);
66+
} else {
67+
return doc.getElementsByAttributeValue(jobBox[0], jobBox[1]);
68+
}
69+
}
70+
71+
private Elements getJobBlocksForRabotaUa(Document doc, String[] jobBox) {
72+
Elements jobBlocks = new Elements();
73+
for (int i = 1; i < jobBox.length; i++) {
74+
Elements jobElements = doc.getElementsByAttributeValue(jobBox[0], jobBox[i]);
75+
if (jobElements != null && !jobElements.isEmpty()) {
76+
jobBlocks.addAll(jobElements);
77+
}
78+
}
79+
return jobBlocks;
80+
}
81+
82+
private Elements getTitleBlock(JobSite jobSite, Element job) {
83+
String[] titleBox = jobSite.getTitleBox();
84+
if (jobSite instanceof WorkUa) {
85+
return job.getElementsByTag("a");
86+
} else {
87+
return job.getElementsByAttributeValue(titleBox[0], titleBox[1]);
88+
}
89+
}
90+
91+
private String getTitle(Elements titleBlock) {
92+
String title = titleBlock.text();
93+
if (title.endsWith("Горячая")) {
94+
title = title.substring(0, title.length() - "Горячая".length());
95+
}
96+
return title;
97+
}
98+
99+
private String getDescription(JobSite jobSite, Element job) {
100+
String[] descriptionData = jobSite.getDescriptionData();
101+
return job.getElementsByAttributeValue(descriptionData[0], descriptionData[1]).text();
102+
}
103+
104+
private LocalDateTime getDate(JobSite jobSite, Element job, String url, Elements titleBlock) {
105+
String[] dateData = jobSite.getDateData();
106+
String dateLine;
107+
if (jobSite instanceof DouUa) {
108+
Document dateDoc = getDoc(url);
109+
dateLine = dateDoc.getElementsByAttributeValue(dateData[0], dateData[1]).text();
110+
return getDateByLine(jobSite, dateLine);
111+
} else if (jobSite instanceof RabotaUa) {
112+
return getDateForRabotaUa(url);
113+
} else {
114+
if (jobSite instanceof WorkUa) {
115+
dateLine = titleBlock.attr("title");
116+
} else {
117+
dateLine = job.getElementsByAttributeValue(dateData[0], dateData[1]).text();
118+
}
119+
return getDateByLine(jobSite, dateLine);
120+
}
121+
}
122+
123+
private LocalDateTime getDateByLine(JobSite jobSite, String dateLine) {
124+
String[] dateParts;
125+
int year;
126+
int month;
127+
int day;
128+
String split = jobSite.getSplit();
129+
if (jobSite instanceof JobsUa) {
130+
dateLine = dateLine.substring(0, 10);
131+
}
132+
if (jobSite instanceof WorkUa) {
133+
dateLine = dateLine.substring(dateLine.length() - 8);
134+
}
135+
dateParts = dateLine.split(split);
136+
MonthsTools.removeZero(dateParts);
137+
day = Integer.parseInt(dateParts[0]);
138+
if (jobSite instanceof HeadHunterUa) {
139+
year = LocalDate.now(ZoneId.of("Europe/Athens")).getYear();
140+
} else year = Integer.parseInt(dateParts[2]);
141+
if (jobSite instanceof WorkUa) {
142+
year = year + 2000;
143+
}
144+
if (jobSite instanceof DouUa || jobSite instanceof HeadHunterUa) {
145+
month = MonthsTools.MONTHS.get(dateParts[1].toLowerCase());
146+
} else month = Integer.parseInt(dateParts[1]);
147+
return LocalDate.of(year, month, day).atTime(getTime());
148+
}
149+
150+
private LocalDateTime getDateForRabotaUa(String url) {
151+
/*
152+
* There are several problems here.
153+
* First: there are two types of date tags, used on rabota.ua on different pages: "d-date" and "datePosted".
154+
* Second: sometimes date format is dd.mm.yyyy and sometimes — yyyy-mm-dd.
155+
* Third: sometimes there is no date at all.
156+
*/
157+
Document dateDoc = getDoc(url);
158+
String dateLine;
159+
String[] dateParts;
160+
int year;
161+
int month;
162+
int day;
163+
164+
Elements dateElements = dateDoc.getElementsByAttributeValue("id", "d-date");
165+
if (!dateElements.isEmpty()) {
166+
dateLine = dateElements.get(0).getElementsByAttributeValue("class", "d-ph-value").text();
167+
} else {
168+
dateLine = dateDoc.getElementsByAttributeValue("itemprop", "datePosted").text();
169+
if (dateLine.length() == 0) {
170+
//no date at all, sometimes it happens
171+
LocalDateTime ldt = LocalDateTime.now(ZoneId.of("Europe/Athens"));
172+
LOGGER.debug("There was no date on Rabota.ua, return {}", ldt);
173+
return ldt;
174+
}
175+
}
176+
try {
177+
dateParts = dateLine.split("\\.");
178+
MonthsTools.removeZero(dateParts);
179+
year = Integer.parseInt(dateParts[2]);
180+
month = Integer.parseInt(dateParts[1]);
181+
day = Integer.parseInt(dateParts[0]);
182+
183+
} catch (ArrayIndexOutOfBoundsException e) {
184+
185+
dateParts = dateLine.split("-");
186+
MonthsTools.removeZero(dateParts);
187+
year = Integer.parseInt(dateParts[0]);
188+
month = Integer.parseInt(dateParts[1]);
189+
day = Integer.parseInt(dateParts[2]);
190+
}
191+
return LocalDate.of(year, month, day).atTime(getTime());
192+
}
193+
194+
private LocalTime getTime() {
195+
return LocalTime.now(ZoneId.of("Europe/Athens"));
196+
}
197+
198+
private String getCompany(JobSite jobSite, Element job, String url) {
199+
String[] companyData = jobSite.getCompanyData();
200+
if (jobSite instanceof JobsUa || jobSite instanceof WorkUa) {
201+
Document jobDoc = getDoc(url);
202+
Elements companyBlock = jobDoc.getElementsByAttributeValue(companyData[0], companyData[1]);
203+
return companyBlock.get(0).getElementsByTag("a").first().text();
204+
} else {
205+
return job.getElementsByAttributeValue(companyData[0], companyData[1]).text();
206+
}
207+
}
10208
}
src/main/java/com/olegshan/parser/Performer.java

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,33 @@
11
package com.olegshan.parser;
22

3-
import com.olegshan.sites.JobSite;
3+
import com.olegshan.sites.*;
44
import org.springframework.beans.factory.annotation.Autowired;
55
import org.springframework.scheduling.annotation.Scheduled;
66
import org.springframework.stereotype.Component;
77

8-
import java.util.List;
9-
108
/**
119
* Created by olegshan on 06.10.2016.
1210
*/
1311
@Component
1412
public class Performer {
15-
private List<JobSite> sites;
16-
private Parser parser;
1713

1814
@Autowired
19-
public Performer(List<JobSite> sites, Parser parser) {
20-
this.sites = sites;
21-
this.parser = parser;
22-
}
15+
private DouUa douUa;
16+
@Autowired
17+
private RabotaUa rabotaUa;
18+
@Autowired
19+
private JobsUa jobsUa;
20+
@Autowired
21+
private WorkUa workUa;
22+
@Autowired
23+
private HeadHunterUa headHunterUa;
2324

2425
@Scheduled(cron = "0 1 7-23 * * *", zone = "Europe/Athens")
2526
public void perform() {
26-
for (JobSite jobSite : sites)
27-
parser.parse(jobSite);
27+
douUa.parse();
28+
headHunterUa.parse();
29+
jobsUa.parse();
30+
rabotaUa.parse();
31+
workUa.parse();
2832
}
2933
}

0 commit comments

Comments
 (0)